In [56]:
from pyspark import StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from functools import reduce
from pyspark.sql.types import *

In [2]:
spark = SparkSession.builder.appName("M2") \
                            .master("local[*]") \
                            .config("spark.driver.memory", "4g") \
                            .config("spark.executor.memory", "4g") \
                            .getOrCreate()

25/02/04 14:26:42 WARN Utils: Your hostname, sumins-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.20.137 instead (on interface en0)
25/02/04 14:26:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/04 14:26:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/04 14:26:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
spark.sparkContext

In [48]:
df = spark.read.parquet("../data/*.parquet").dropna()

In [49]:
df = df.withColumn('tpep_pickup_datetime', F.col('tpep_pickup_datetime').cast(TimestampType()))
df = df.withColumn('tpep_dropoff_datetime', F.col('tpep_dropoff_datetime').cast(TimestampType()))

In [50]:
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



In [51]:
columns_to_check = ["passenger_count", "trip_distance"]
df_filtered = df.filter(reduce(lambda c1, c2: c1 & c2, [F.col(c) > 0 for c in columns_to_check])) \
                .filter(F.unix_timestamp("tpep_dropoff_datetime") - F.unix_timestamp("tpep_pickup_datetime") > 0)

In [52]:
high_value = df_filtered.agg(
    F.percentile_approx("trip_distance", 0.99, 100).alias("percentiles")
).collect()[0]["percentiles"]

                                                                                

In [53]:
df_filtered = df_filtered.filter(df_filtered.trip_distance < high_value)

In [54]:
weather_schema = StructType([
    StructField('Datetime', TimestampType(), True),
    StructField('Temperature', IntegerType(), True),
    StructField('Humidity', IntegerType(), True),
    StructField('Wind Speed', IntegerType(), True),
    StructField('Condition', StringType(), True)
])
weather = spark.read.csv("../data/weather.csv", header=True, schema=weather_schema, enforceSchema=False)
weather.printSchema()

root
 |-- Datetime: timestamp (nullable = true)
 |-- Temperature: integer (nullable = true)
 |-- Humidity: integer (nullable = true)
 |-- Wind Speed: integer (nullable = true)
 |-- Condition: string (nullable = true)



In [57]:
df_tr_1 = df_filtered.filter(df_filtered.passenger_count >= 1).persist(StorageLevel.MEMORY_AND_DISK)
df_tr_2 = df_tr_1.agg(
    F.count('*'),
    F.avg(F.col('trip_distance')),
    F.sum(F.col('total_amount'))
    )
df_tr_3 = df_tr_1.join(
    weather,
    (
        F.date_format(df_tr_1['tpep_pickup_datetime'], 'yyyy-MM-dd-HH')
        == F.date_format(weather['Datetime'], 'yyyy-MM-dd-HH')
     ),
    'inner'
)

In [58]:
df_tr_2.collect()

                                                                                

[Row(count(1)=32924293, avg(trip_distance)=3.573478439156939, sum(total_amount)=926045447.3687849)]

In [59]:
df_tr_3.coalesce(1).write.mode("overwrite").parquet("joined")

                                                                                