In [33]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

In [34]:
# Build (or reuse) a local Spark session named "SparkSQL"; "local[*]" is the
# master URL passed to the builder.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SparkSQL")
    .getOrCreate()
)

In [35]:
# Read every Parquet file matched by the three-level wildcard under
# data/raw/green/ into a single DataFrame.
# NOTE(review): the */*/* levels are presumably year/month/file — confirm
# against the on-disk layout.
df_green = spark.read.parquet("data/raw/green/*/*/*")

In [36]:
# Preview the first five rows of the green data.
df_green.show(n=5)

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2019-12-18 15:52:30|  2019-12-18 15:54:39|                 N|       1.0|         264|         264|            5.0|          0.0|        3.5|  0.5|    0.

In [37]:
# Print the green DataFrame's column names, types and nullability as a tree.
df_green.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- lpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- lpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: integer (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: double (nullable = true)
 |-- trip_type: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [38]:
# Shape of the green data, displayed as a (row count, column count) tuple.
df_green.count(), len(df_green.columns)

(2802931, 20)

In [39]:
# Number of partitions of the RDD underlying the green DataFrame.
df_green.rdd.getNumPartitions()

4

In [40]:
# Read every Parquet file matched by the three-level wildcard under
# data/raw/yellow/ into a single DataFrame.
# NOTE(review): the */*/* levels are presumably year/month/file — confirm
# against the on-disk layout.
df_yellow = spark.read.parquet("data/raw/yellow/*/*/*")

In [41]:
# Display the yellow schema as a StructType object (the programmatic
# counterpart of the tree printed by printSchema()).
df_yellow.schema

StructType([StructField('VendorID', LongType(), True), StructField('tpep_pickup_datetime', TimestampNTZType(), True), StructField('tpep_dropoff_datetime', TimestampNTZType(), True), StructField('passenger_count', DoubleType(), True), StructField('trip_distance', DoubleType(), True), StructField('RatecodeID', DoubleType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('payment_type', LongType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True), StructField('airport_fee', IntegerType(), True)])

In [42]:
# Print the yellow DataFrame's column names, types and nullability as a tree.
df_yellow.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: integer (nullable = true)



In [43]:
# Shape of the yellow data, displayed as a (row count, column count) tuple.
# The column count must come from df_yellow itself, not df_green.
df_yellow.count(), len(df_yellow.columns)

(55553400, 19)

The schemas are a little different, so we will need to do some data cleaning.

In [44]:
# Column names that appear in both the yellow and the green schema.
set(df_yellow.columns).intersection(df_green.columns)

{'DOLocationID',
 'PULocationID',
 'RatecodeID',
 'VendorID',
 'congestion_surcharge',
 'extra',
 'fare_amount',
 'improvement_surcharge',
 'mta_tax',
 'passenger_count',
 'payment_type',
 'store_and_fwd_flag',
 'tip_amount',
 'tolls_amount',
 'total_amount',
 'trip_distance'}

In [45]:
# How many column names the two schemas have in common.
len(set(df_yellow.columns).intersection(df_green.columns))

16

Only 16 columns are the same in both datasets:

{'DOLocationID',
 'PULocationID',
 'RatecodeID',
 'VendorID',
 'congestion_surcharge',
 'extra',
 'fare_amount',
 'improvement_surcharge',
 'mta_tax',
 'passenger_count',
 'payment_type',
 'store_and_fwd_flag',
 'tip_amount',
 'tolls_amount',
 'total_amount',
 'trip_distance'}

In [46]:
# Column names that appear in exactly one of the two schemas.
set(df_yellow.columns) ^ set(df_green.columns)

{'airport_fee',
 'ehail_fee',
 'lpep_dropoff_datetime',
 'lpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'tpep_pickup_datetime',
 'trip_type'}

In [47]:
# Column names that exist in only one of the two DataFrames. Derived from the
# live schemas instead of being pasted in by hand, so the report stays correct
# if the input data (and therefore the schemas) ever changes.
column_names = set(df_yellow.columns).symmetric_difference(df_green.columns)

# Check which dataframe each column comes from
for column in column_names:
    if column in df_yellow.columns:
        print(f"{column} -> df_yellow")
    if column in df_green.columns:
        print(f"{column} -> df_green")

trip_type -> df_green
airport_fee -> df_yellow
tpep_dropoff_datetime -> df_yellow
tpep_pickup_datetime -> df_yellow
ehail_fee -> df_green
lpep_dropoff_datetime -> df_green
lpep_pickup_datetime -> df_green


In [54]:
# Drop the "lpep_" prefix from the green pickup/dropoff timestamp columns.
df_green = (
    df_green
    .withColumnRenamed("lpep_dropoff_datetime", "dropoff_datetime")
    .withColumnRenamed("lpep_pickup_datetime", "pickup_datetime")
)

In [55]:
# Drop the "tpep_" prefix from the yellow pickup/dropoff timestamp columns.
df_yellow = (
    df_yellow
    .withColumnRenamed("tpep_dropoff_datetime", "dropoff_datetime")
    .withColumnRenamed("tpep_pickup_datetime", "pickup_datetime")
)

In [58]:
# Columns still present in only one of the two DataFrames.
column_names = set(df_yellow.columns).symmetric_difference(df_green.columns)

# For each such column, report which DataFrame(s) contain it.
for column in column_names:
    in_yellow = column in df_yellow.columns
    in_green = column in df_green.columns
    if in_yellow:
        print(f"{column} -> df_yellow")
    if in_green:
        print(f"{column} -> df_green")

airport_fee -> df_yellow
ehail_fee -> df_green
trip_type -> df_green


In [60]:
# Column names shared by both DataFrames
set(df_yellow.columns).intersection(df_green.columns)

{'DOLocationID',
 'PULocationID',
 'RatecodeID',
 'VendorID',
 'congestion_surcharge',
 'dropoff_datetime',
 'extra',
 'fare_amount',
 'improvement_surcharge',
 'mta_tax',
 'passenger_count',
 'payment_type',
 'pickup_datetime',
 'store_and_fwd_flag',
 'tip_amount',
 'tolls_amount',
 'total_amount',
 'trip_distance'}

In [61]:
# Add a constant string column "source" to each DataFrame holding the name of
# the dataset the row came from ("green" / "yellow").
# `lit` is imported in the notebook's top import cell with the other imports.
df_green = df_green.withColumn("source", lit("green"))
df_yellow = df_yellow.withColumn("source", lit("yellow"))

In [62]:
# Print the green schema again to inspect the column names and types at this
# point in the notebook.
df_green.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: integer (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: double (nullable = true)
 |-- trip_type: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- source: string (nullable = false)



In [63]:
# Print the yellow schema again to inspect the column names and types at this
# point in the notebook.
df_yellow.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: integer (nullable = true)
 |-- source: string (nullable = false)



In [65]:
# Columns present in both DataFrames, listed in df_yellow's column order.
# An ordered list is used instead of a set intersection: iterating a set of
# strings yields an arbitrary order that can change between kernel restarts,
# which would make the column order of the selected frames irreproducible.
green_column_names = set(df_green.columns)
common_columns = [name for name in df_yellow.columns if name in green_column_names]

# Restrict the yellow data to the shared columns.
df_yellow_common = df_yellow.select(*common_columns)

In [66]:
# Restrict the green data to the shared columns. The selection must be made on
# df_green; selecting from df_yellow would make this frame a copy of the yellow
# data under a "green" name.
df_green_common = df_green.select(*common_columns)