# Random Forest

## Spark

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/f/f3/Apache_Spark_logo.svg/1280px-Apache_Spark_logo.svg.png" width="400">

**Hardware**: 20 nodes, r5.2xlarge (8 CPU, 64 GB RAM)

# Load data

In [1]:
import findspark
# findspark.init() runs before anything is imported from pyspark; keep this order.
# NOTE(review): presumably this is what makes pyspark importable here -- confirm.
findspark.init()

from pyspark.sql import SparkSession

# Configure the executor memory setting ('spark.executor.memory' = '36g'),
# then obtain the session through getOrCreate().
session_builder = SparkSession.builder.config('spark.executor.memory', '36g')
spark = session_builder.getOrCreate()

In [2]:
import s3fs
import functools
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import DataFrame

In [3]:
# Declare the CSV schema by hand: letting read.csv work it out (inferSchema) is quite slow.
# Each entry is (column name, Spark type), listed in schema order.
schema_columns = [
    ('VendorID', DoubleType()),
    ('tpep_pickup_datetime', TimestampType()),
    ('tpep_dropoff_datetime', TimestampType()),
    ('passenger_count', DoubleType()),
    ('trip_distance', DoubleType()),
    ('RatecodeID', DoubleType()),
    ('store_and_fwd_flag', StringType()),
    ('PULocationID', DoubleType()),
    ('DOLocationID', DoubleType()),
    ('payment_type', DoubleType()),
    ('fare_amount', DoubleType()),
    ('extra', DoubleType()),
    ('mta_tax', DoubleType()),
    ('tip_amount', DoubleType()),
    ('tolls_amount', DoubleType()),
    ('improvement_surcharge', DoubleType()),
    ('total_amount', DoubleType()),
    ('congestion_surcharge', DoubleType()),
]

schema = StructType([StructField(name, dtype) for name, dtype in schema_columns])

In [4]:
# List the bucket anonymously (anon=True).
fs = s3fs.S3FileSystem(anon=True)

# Keep only the paths that mention 'yellow' and one of the years 2017-2019.
years = ('2019', '2018', '2017')
files = [
    f"s3://{path}"
    for path in fs.ls('s3://nyc-tlc/trip data/')
    if 'yellow' in path and any(year in path for year in years)
]

# Columns kept from every file (the schema's 'congestion_surcharge' is not selected).
cols = [
    'VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
    'passenger_count', 'trip_distance', 'RatecodeID',
    'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
    'payment_type', 'fare_amount', 'extra', 'mta_tax',
    'tip_amount', 'tolls_amount', 'improvement_surcharge',
    'total_amount',
]


def read_csv(path):
    """Read one CSV at ``path`` and return it restricted to ``cols``.

    Uses the module-level ``spark`` session and ``schema``; the file is read
    with a header row and timestamps in 'yyyy-MM-dd HH:mm:ss' format.
    """
    raw = spark.read.csv(
        path,
        header=True,
        schema=schema,
        timestampFormat='yyyy-MM-dd HH:mm:ss',
    )
    return raw.select(cols)


# One DataFrame per file, folded pairwise into a single frame.
dfs = [read_csv(tf) for tf in files]
taxi = functools.reduce(DataFrame.unionAll, dfs)

In [5]:
%%time
# Count the rows of the unioned `taxi` frame; %%time reports how long the cell took.
taxi.count()

CPU times: user 436 µs, sys: 3.64 ms, total: 4.07 ms
Wall time: 20.6 s


300700143

# Feature engineering

In [6]:
# NOTE(review): this cell rebinds `taxi` and overwrites 'store_and_fwd_flag' with 0/1,
# so it is meant to run once per load; re-running it would apply the 'Y' comparison
# to the already-encoded column.
pickup_ts = F.col('tpep_pickup_datetime')

taxi = (
    taxi
    # Calendar parts of the pickup timestamp, stored as doubles.
    .withColumn('pickup_weekday', F.dayofweek(pickup_ts).cast(DoubleType()))
    .withColumn('pickup_hour', F.hour(pickup_ts).cast(DoubleType()))
    .withColumn('pickup_minute', F.minute(pickup_ts).cast(DoubleType()))
    # Weekday and hour folded into one value: weekday * 24 + hour.
    .withColumn(
        'pickup_week_hour',
        ((F.col('pickup_weekday') * 24) + F.col('pickup_hour')).cast(DoubleType()),
    )
    # 1 when the flag equals 'Y', 0 otherwise.
    .withColumn(
        'store_and_fwd_flag',
        F.when(F.col('store_and_fwd_flag') == 'Y', 1).otherwise(0),
    )
    # Spark ML expects "label" column for dependent variable
    .withColumn('label', F.col('total_amount'))
    # Fill nulls with -1.
    .fillna(-1)
)

In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.pipeline import Pipeline

# Input columns that the assembler packs into the 'features' vector column.
features = [
    'pickup_weekday',
    'pickup_hour',
    'pickup_minute',
    'pickup_week_hour',
    'passenger_count',
    'VendorID',
    'RatecodeID',
    'store_and_fwd_flag',
    'PULocationID',
    'DOLocationID',
]

# Single-stage pipeline: the only stage is the vector assembler.
assembler = VectorAssembler(inputCols=features, outputCol='features')
pipeline = Pipeline(stages=[assembler])

In [8]:
%%time
assembler_fitted = pipeline.fit(taxi)
X = assembler_fitted.transform(taxi)

X.cache()
X.count()

CPU times: user 10.3 ms, sys: 440 µs, total: 10.8 ms
Wall time: 54.3 s


300700143

# Train random forest!

In [9]:
from pyspark.ml.regression import RandomForestRegressor

# 100 trees, maximum depth 10, fixed seed. No feature/label column is passed,
# so the estimator's own defaults apply.
# NOTE(review): presumably those defaults are 'features' and 'label', the columns built earlier -- confirm.
rf = RandomForestRegressor(
    numTrees=100,
    maxDepth=10,
    seed=42,
)

In [11]:
%%time
# Fit the random forest on the assembled frame X; %%time reports how long the cell took.
fitted = rf.fit(X)

CPU times: user 255 ms, sys: 46.2 ms, total: 301 ms
Wall time: 36min 53s
