# Read data from GCS and transform
* Data set used = [FHVHV June 2021](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-06.csv.gz)
* Runs against a local Spark standalone cluster (one master and one worker)

## Uploading data to GCS from local

In [9]:
# Run this from the data directory: data_dir = '/home/sanyashireen/week_5_homework/data'
# NOTE: do not add a trailing "/" after the "06" in the bucket path - it creates an extra sub-folder
# !gsutil -m cp -r pq/fhvhv/2021/06/ gs://dtc_data_lake_blissful-flames-375219/pq/fhvhv/2021/06
#sc.stop()

23/03/03 01:14:58 INFO SparkUI: Stopped Spark web UI at http://de-zoomcamp.us-central1-c.c.blissful-flames-375219.internal:4040
23/03/03 01:14:58 INFO StandaloneSchedulerBackend: Shutting down all executors
23/03/03 01:14:58 INFO CoarseGrainedSchedulerBackend$DriverEndpoint: Asking each executor to shut down
23/03/03 01:14:58 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
23/03/03 01:14:58 INFO MemoryStore: MemoryStore cleared
23/03/03 01:14:58 INFO BlockManager: BlockManager stopped
23/03/03 01:14:58 INFO BlockManagerMaster: BlockManagerMaster stopped
23/03/03 01:14:58 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
23/03/03 01:14:58 INFO SparkContext: Successfully stopped SparkContext


In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext


In [2]:
import os

# Project layout: a single root with the code and data directories derived
# from it, so the three paths cannot drift apart.
parent_dir = '/home/sanyashireen/week_5_homework'
coding_dir = os.path.join(parent_dir, 'code')
data_dir = os.path.join(parent_dir, 'data')

# Work from the project root (not the data directory): other cells use paths
# relative to it, e.g. './lib/...' for the GCS connector jar and 'data/' for
# the zone lookup download.
os.chdir(parent_dir)
print(f'Current directory: {os.getcwd()}')

Current directory: /home/sanyashireen/week_5_homework


In [3]:
# Service-account key file the GCS connector authenticates with.
credentials_location = '/home/sanyashireen/.google/credentials/google_credentials.json'

# Driver configuration: standalone master URL, application name, the GCS
# connector jar (path is relative to the current working directory) and
# service-account authentication for the connector.
conf = (
    SparkConf()
    .setMaster("spark://de-zoomcamp.us-central1-c.c.blissful-flames-375219.internal:7077")
    .setAppName('test')
    .set("spark.jars", "./lib/gcs-connector-hadoop3-2.2.5.jar")
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)
)

# getOrCreate() rather than SparkContext(conf=conf): re-running this cell on a
# live kernel reuses the active context instead of raising
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# Register the gs:// filesystem implementations and the credentials on this
# context's Hadoop configuration.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

# Build the SQL entry point from the context's configuration.
spark = (
    SparkSession.builder
    .config(conf=sc.getConf())
    .getOrCreate()
)

23/03/03 03:15:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [13]:
# Display the SparkSession object as the cell output.
spark

## Extracting data from GCS bucket 

In [45]:
# Load the June 2021 FHVHV parquet files straight from the GCS bucket.
fhvhv_parquet_path = 'gs://dtc_data_lake_blissful-flames-375219/pq/fhvhv/2021/06'
df = spark.read.parquet(fhvhv_parquet_path)

# Second name bound to the same DataFrame object (not a copy).
df_orig = df

In [46]:
# Number of rows in the DataFrame read from the bucket.
df.count()

                                                                                

14961892

In [6]:
# Column names, types and nullability as stored in the parquet files.
df.schema

StructType([StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', TimestampType(), True), StructField('dropoff_datetime', TimestampType(), True), StructField('PULocationID', IntegerType(), True), StructField('DOLocationID', IntegerType(), True), StructField('SR_Flag', StringType(), True), StructField('Affiliated_base_number', StringType(), True)])

In [7]:
# Print the first five rows as a quick sanity check.
df.show(5)

[Stage 4:>                                                          (0 + 1) / 1]

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B02889|2021-06-04 20:51:44|2021-06-04 21:10:12|         239|         158|      N|                B02889|
|              B02800|2021-06-04 15:50:15|2021-06-04 16:19:29|          75|         116|      N|                  null|
|              B02510|2021-06-02 21:03:38|2021-06-02 21:10:12|         167|         168|      N|                  null|
|              B02867|2021-06-02 12:51:57|2021-06-02 13:05:09|         151|         142|      N|                B02867|
|              B02869|2021-06-21 09:51:45|2021-06-21 10:09:17|         106|          65|      N|                B02869|
+--------------------+------------------

                                                                                

## Counting number of trips on June 15

### Method 1 using pyspark df

In [26]:
from pyspark.sql import functions as F
from pyspark.sql import types

In [56]:
# Day whose trips are counted.
date_to_compare = '2021-06-15'

# Keep only the pickup timestamp and add its calendar date as a new column.
df_sel = df.select('pickup_datetime').withColumn(
    'new_pickup_datetime', F.to_date(F.col('pickup_datetime'))
)

# Count the rows whose pickup date equals the target day.
is_target_day = F.col('new_pickup_datetime') == date_to_compare
df_sel.filter(is_target_day).count()

                                                                                

452470

### Method 2 using pyspark df

In [58]:
# Half-open range [2021-06-15 00:00:00, 2021-06-16 00:00:00): unlike an
# inclusive BETWEEN ending at '23:59:59', this also keeps any pickup that has
# fractional seconds within the last second of the day.
pickup_ts = F.col('pickup_datetime')
df.filter(
    (pickup_ts >= '2021-06-15 00:00:00') & (pickup_ts < '2021-06-16 00:00:00')
).count()

                                                                                

452470

### Method 3 using spark sql

In [8]:
# Expose the trips DataFrame to Spark SQL under the name 'trips'.
# createOrReplaceTempView replaces the deprecated registerTempTable and is safe
# to re-run: an existing view with the same name is simply replaced.
df.createOrReplaceTempView('trips')



In [13]:
# Count the trips picked up on 2021-06-15 with Spark SQL.
# - COUNT(*) counts every row; COUNT(dispatching_base_num) would skip rows
#   where that (nullable) column is NULL.
# - The half-open range also covers pickups with fractional seconds in the
#   last second of the day, which BETWEEN ... '23:59:59' would miss.
spark.sql("""
SELECT
    COUNT(*) AS `Total Trips`
FROM
    trips
WHERE
    pickup_datetime >= '2021-06-15 00:00:00'
    AND pickup_datetime < '2021-06-16 00:00:00'
""").show()



+-----------+
|Total Trips|
+-----------+
|     452470|
+-----------+



                                                                                

## Longest trips (duration in hours)

In [70]:
# Trip duration in hours: difference of the two timestamps in epoch seconds,
# divided by 3600 and rounded to three decimals.
pickup_secs = F.unix_timestamp(F.col('pickup_datetime'))
dropoff_secs = F.unix_timestamp(F.col('dropoff_datetime'))
duration_hours = F.round((dropoff_secs - pickup_secs) / 3600, 3)

trip_durations = df.select(
    'dispatching_base_num', 'pickup_datetime', 'dropoff_datetime'
).withColumn('total_duration_hours', duration_hours)

# Show the five longest trips across the whole data set.
trip_durations.orderBy(F.desc('total_duration_hours')).show(5)



+--------------------+-------------------+-------------------+--------------------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|total_duration_hours|
+--------------------+-------------------+-------------------+--------------------+
|              B02872|2021-06-25 13:55:41|2021-06-28 08:48:25|              66.879|
|              B02765|2021-06-22 12:09:45|2021-06-23 13:42:44|               25.55|
|              B02879|2021-06-27 10:32:29|2021-06-28 06:31:20|              19.981|
|              B02800|2021-06-26 22:37:11|2021-06-27 16:49:01|              18.197|
|              B02682|2021-06-23 20:40:43|2021-06-24 13:08:44|              16.467|
+--------------------+-------------------+-------------------+--------------------+
only showing top 5 rows





## Most frequent pickup location zone

In [77]:
# Downloading the zones data and reading it into pysparkdf
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv -P 'data/'

--2023-03-03 05:31:21--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-4584-9c62-a6ea97ed0e6a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230303%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230303T053121Z&X-Amz-Expires=300&X-Amz-Signature=28bf104c68d1af4dd9324e18ec251a438bb0362af8331a95e50450e09125d973&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dtaxi_zone_lookup.csv&response-content-type=application%2Foctet-stream [following]
--2023-03-03 05:31:21--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-4584-9c62-a6e

In [81]:
import pandas as p

# Read a 100-row sample of the zone lookup with pandas, print its first rows,
# then display the schema Spark infers from that sample.
zones_file_name = 'taxi_zone_lookup.csv'
zones_csv_path = f'{data_dir}/{zones_file_name}'
p_df = p.read_csv(zones_csv_path, nrows=100)
print(p_df.head(5))

inferred_zone_schema = spark.createDataFrame(p_df).schema
inferred_zone_schema

   LocationID        Borough                     Zone service_zone
0           1            EWR           Newark Airport          EWR
1           2         Queens              Jamaica Bay    Boro Zone
2           3          Bronx  Allerton/Pelham Gardens    Boro Zone
3           4      Manhattan            Alphabet City  Yellow Zone
4           5  Staten Island            Arden Heights    Boro Zone


StructType([StructField('LocationID', LongType(), True), StructField('Borough', StringType(), True), StructField('Zone', StringType(), True), StructField('service_zone', StringType(), True)])

In [85]:
# Explicit schema for the zone lookup: (column name, Spark type) pairs, all
# nullable. LocationID is declared as an integer column.
zone_columns = [
    ('LocationID', types.IntegerType()),
    ('Borough', types.StringType()),
    ('Zone', types.StringType()),
    ('service_zone', types.StringType()),
]
schema = types.StructType(
    [types.StructField(col_name, col_type, True) for col_name, col_type in zone_columns]
)

# Read the CSV with its header row, applying the schema above.
zone_df = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv(f'{data_dir}/{zones_file_name}')
)

In [86]:
# Expose the zone lookup to Spark SQL under the name 'zones'.
# createOrReplaceTempView replaces the deprecated registerTempTable and is safe
# to re-run: an existing view with the same name is simply replaced.
zone_df.createOrReplaceTempView('zones')



In [89]:
# WITH-clause version: count trips per pickup location, keep only the single
# busiest location, then join that one row to the zone lookup for its name.
top_pickup_zone_query = """

with fhv_data as(
SELECT 
    PULocationID, COUNT(1) as total_trips
FROM
    trips
GROUP BY
    1
ORDER BY
    total_trips DESC
LIMIT 1),

zones_data as(
SELECT * from zones
)

SELECT 
    fhv_data.PULocationID as LocationID,
    fhv_data.total_trips as total_trips,
    zones_data.Zone as zone
FROM
    fhv_data
    inner join zones_data 
    on fhv_data.PULocationID = zones_data.LocationID   
"""

spark.sql(top_pickup_zone_query).show()

                                                                                

+----------+-----------+-------------------+
|LocationID|total_trips|               zone|
+----------+-----------+-------------------+
|        61|     231279|Crown Heights North|
+----------+-----------+-------------------+



In [91]:
# Single-statement version: match every trip to its pickup zone, count trips
# per (location, zone name) pair and keep the pair with the most trips.
busiest_zone_query = """
SELECT 
    t.PULocationID, 
    COUNT(1) as total_trips, 
    z.Zone as zone_name
FROM
    trips as t,
    zones as z
WHERE
    t.PULocationID=z.LocationID
GROUP BY
    1,3
ORDER BY
    total_trips DESC
LIMIT 1  
"""

spark.sql(busiest_zone_query).show()



+------------+-----------+-------------------+
|PULocationID|total_trips|          zone_name|
+------------+-----------+-------------------+
|          61|     231279|Crown Heights North|
+------------+-----------+-------------------+



                                                                                