In [4]:
# Spark setup: build the SparkSession used by every cell below.
# getOrCreate() hands back an already-running session when one exists.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("NYC Taxi Big Data Task1")
    .getOrCreate()
)


In [3]:
# Load dataset: read the trip CSV, using its first row as column names and
# letting Spark infer each column's type from the data.
df = spark.read.csv(
    "/content/yellow_tripdata_2015-01.csv",
    header=True,
    inferSchema=True,
)
df.printSchema()  # show the inferred column names and types
df.count()  # last expression of the cell, so the row count is displayed


root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- RateCodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)



67293

In [5]:
# Remove null values: discard every row that has a null in any column.
# `col` is not needed here; it is imported for the filtering cells that follow.
from pyspark.sql.functions import col

df_clean = df.na.drop(how="any")
df_clean.count()  # rows remaining after the null drop

67293

In [6]:
# Remove invalid records: keep only trips whose passenger count, distance,
# fare and total amount are all strictly positive.
df_clean = df_clean.filter(
    (col("passenger_count") > 0)
    & (col("trip_distance") > 0)
    & (col("fare_amount") > 0)
    & (col("total_amount") > 0)
)


In [7]:
# Datetime conversion: ensure both trip endpoints are timestamp-typed.
# NOTE(review): the schema printed after loading already reports these two
# columns as timestamp, so this step is presumably a no-op kept as a
# safeguard -- confirm before relying on it.
from pyspark.sql.functions import to_timestamp

df_clean = df_clean.withColumn(
    "tpep_pickup_datetime", to_timestamp("tpep_pickup_datetime")
)
df_clean = df_clean.withColumn(
    "tpep_dropoff_datetime", to_timestamp("tpep_dropoff_datetime")
)


In [8]:
# Trip duration feature: elapsed time from pickup to dropoff, in minutes.
from pyspark.sql.functions import unix_timestamp

df_clean = (
    df_clean
    .withColumn(
        "trip_duration_minutes",
        # unix_timestamp() yields seconds, so the difference is divided by 60.
        (
            unix_timestamp("tpep_dropoff_datetime")
            - unix_timestamp("tpep_pickup_datetime")
        ) / 60,
    )
    # Drop trips whose dropoff is not strictly later than the pickup.
    .filter(col("trip_duration_minutes") > 0)
)


In [9]:
# Cache: mark the cleaned DataFrame for caching, then run an action on it.
df_clean = df_clean.cache()  # cache() returns the same DataFrame it was called on
df_clean.count()  # action over the cached frame; displays the remaining row count


66809