In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import isnull, avg, min, date_format
from operator import add

In [2]:
# Connect to the standalone cluster master (reusing a live session if one exists)
# and request the executor memory / core settings used by this notebook.
spark = (
    SparkSession.builder
    .master('spark://spark-master:7077')
    .appName('W5M2')
    .config('spark.executor.memory', '8gb')
    .config("spark.executor.cores", "5")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/08/04 12:49:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Input datasets on HDFS.
TLC_data_path = 'hdfs://spark-master:9000/user/hduser/hdfs_data/fhvhv_tripdata_2023-01.parquet'
weather_data_path = 'hdfs://spark-master:9000/user/hduser/hdfs_data/72505394728.csv'

# Directory that receives the result sets written at the end of the notebook.
output_dir_path = 'hdfs://spark-master:9000/user/spark_user/W5M2_output/'

# File formats handed to load_dataframe for each input.
tlc_ext, weather_ext = 'parquet', 'csv'

# Data Loading
- The application should load the TLC Trip Record Data into a Spark DataFrame.
- The schema of the DataFrame should be inferred or explicitly defined.

In [4]:
def load_dataframe(spark_session, file_path, extension, **read_options):
    """Read a file into a Spark DataFrame using the reader that matches its format.

    Args:
        spark_session: Session whose ``read`` attribute provides the readers.
        file_path: Location of the file to load.
        extension: Format selector; ``"csv"`` and ``"parquet"`` are supported.
        **read_options: Optional keyword arguments forwarded unchanged to the
            selected reader (e.g. ``header=True, inferSchema=True`` for CSV).
            With no options the reader is called exactly as before.

    Returns:
        Whatever the selected reader returns for ``file_path``.

    Raises:
        NotImplementedError: If ``extension`` is not one of the supported formats.
    """
    reader = spark_session.read
    if extension == "csv":
        return reader.csv(file_path, **read_options)
    if extension == "parquet":
        return reader.parquet(file_path, **read_options)
    # Name the offending value so a mistyped format is easy to spot in the traceback.
    raise NotImplementedError(
        f"Unsupported extension {extension!r}; expected 'csv' or 'parquet'"
    )

In [5]:
# Load the TLC trip records, then show the schema and a single record
# (printed vertically, one field per line) as a quick sanity check.
df = load_dataframe(spark_session=spark, file_path=TLC_data_path, extension=tlc_ext)
print("- The schema of the TLC DataFrame - \n", df.schema)
df.show(n=1, vertical=True)

                                                                                

- The schema of the TLC DataFrame - 
 StructType([StructField('hvfhs_license_num', StringType(), True), StructField('dispatching_base_num', StringType(), True), StructField('originating_base_num', StringType(), True), StructField('request_datetime', TimestampType(), True), StructField('on_scene_datetime', TimestampType(), True), StructField('pickup_datetime', TimestampType(), True), StructField('dropoff_datetime', TimestampType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('trip_miles', DoubleType(), True), StructField('trip_time', LongType(), True), StructField('base_passenger_fare', DoubleType(), True), StructField('tolls', DoubleType(), True), StructField('bcf', DoubleType(), True), StructField('sales_tax', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True), StructField('airport_fee', DoubleType(), True), StructField('tips', DoubleType(), True), StructField('driver_pay', DoubleType(), Tru



-RECORD 0-----------------------------------
 hvfhs_license_num    | HV0003              
 dispatching_base_num | B03404              
 originating_base_num | B03404              
 request_datetime     | 2023-01-01 00:18:06 
 on_scene_datetime    | 2023-01-01 00:19:24 
 pickup_datetime      | 2023-01-01 00:19:38 
 dropoff_datetime     | 2023-01-01 00:48:07 
 PULocationID         | 48                  
 DOLocationID         | 68                  
 trip_miles           | 0.94                
 trip_time            | 1709                
 base_passenger_fare  | 25.95               
 tolls                | 0.0                 
 bcf                  | 0.78                
 sales_tax            | 2.3                 
 congestion_surcharge | 2.75                
 airport_fee          | 0.0                 
 tips                 | 5.22                
 driver_pay           | 27.83               
 shared_request_flag  | N                   
 shared_match_flag    | N                   
 access_a_

                                                                                

# Data Cleaning
- The application should remove invalid or null entries.
- Filter out unrealistic values from the DataFrame.

In [6]:
# Drop a row only when one of the columns this notebook actually uses is null.
# A blanket na.drop('any') also discards trips whose unrelated columns are empty,
# even though those columns are projected away before any analysis runs.
# NOTE(review): some TLC columns (e.g. on_scene_datetime, originating_base_num) are
# presumably not populated for every trip - confirm against the data dictionary that
# the subset below is the intended one.
required_cols = ["pickup_datetime", "trip_miles", "base_passenger_fare", "driver_pay"]
df = (
    df.na.drop(how="any", subset=required_cols)
    # Records with non-positive driver pay or passenger fare are treated as unrealistic.
    .filter((df.driver_pay > 0) & (df.base_passenger_fare > 0))
)

# Data Transformations
- Apply various transformations (filtering, aggregations, joins) to derive meaningful insights from the data.
- Ensure the transformations are designed to create a clear and optimized DAG.

In [7]:
# Keep only the pickup day (formatted as a 'yyyy-MM-dd' string) and the two measures
# used by the cells below, then mark the slimmed-down frame for caching because
# several later queries are built from it.
pickup_day = date_format(df.pickup_datetime, 'yyyy-MM-dd').alias("pickup_date")
df = df.select(pickup_day, df.base_passenger_fare, df.trip_miles)
df.cache()

DataFrame[pickup_date: string, base_passenger_fare: double, trip_miles: double]

- Perform at least three different types of transformations on the DataFrame. Examples include:

    - Filtering: Filter the trips based on certain criteria (e.g., trips with more than one passenger).

    - Aggregations: Calculate the total number of trips, average trip distance, and total revenue generated for a specific time period.

    - Joins: If using multiple datasets, join the trip data with another relevant dataset (e.g., weather data to analyze the impact of weather on trip durations).

In [8]:
# Filtering: keep the trips whose trip_miles value is below 10.
short_trip_df = df.filter(df["trip_miles"] < 10)

In [9]:
# Aggregation: sum of base_passenger_fare per pickup date, ordered by date.
per_day_total_revenue_df = (
    df.groupBy("pickup_date")
    .sum("base_passenger_fare")
    .orderBy("pickup_date")
)

In [10]:
# Aggregation: mean of trip_miles per pickup date, ordered by date.
per_day_avg_trip_miles_df = (
    df.groupBy("pickup_date")
    .mean("trip_miles")
    .orderBy("pickup_date")
)

# Data Actions
- Execute actions to trigger the transformations and obtain results.
- Save the results to a specified storage format.

In [None]:
# Execute actions
# take(1) is an action, so each call forces the corresponding lazy plan to run
# and prints the first row it returns.
for result_df in (short_trip_df, per_day_total_revenue_df, per_day_avg_trip_miles_df):
    print(result_df.take(1))



In [None]:
# Save results
# Each frame is collapsed to a single partition before being written as CSV under
# output_dir_path, in a sub-directory named after the variable.
# mode("overwrite") replaces the output of an earlier run; the default save mode
# raises when the target directory already exists, so re-running the notebook failed.
results_to_save = {
    "df": df,
    "short_trip_df": short_trip_df,
    "per_day_total_revenue_df": per_day_total_revenue_df,
    "per_day_avg_trip_miles_df": per_day_avg_trip_miles_df,
}
for result_name, result_df in results_to_save.items():
    result_df.coalesce(1).write.mode("overwrite").csv(output_dir_path + result_name)