In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.ml.regression as mlreg
import pyspark.ml.tuning as tune
import pyspark.ml.evaluation as evals
import pyspark.ml.pipeline as pipe
import pyspark.ml.feature as feat

### I. Creating the dataset

In [2]:
# Get (or create) the Spark session for this notebook, then load every file
# matching the data glob as CSV. header=True takes the column names from the
# first row; inferSchema is not passed, so no column types are requested here.
session_builder = SparkSession.builder.appName("ML-Example")
spark = session_builder.getOrCreate()
df = spark.read.csv("/home/jovyan/data/*", header=True)
n_rows = df.count()
print(f'There are {n_rows} rows in the dataset')

There are 1104418 rows in the dataset


In [3]:
# Preview the first five raw rows.
df.show(n=5)

+--------+-------------------+-------------------+--------------------+--------------------+------------------+--------------------+-----------+-----------+
|Duration|         Start date|           End date|Start station number|       Start station|End station number|         End station|Bike number|Member type|
+--------+-------------------+-------------------+--------------------+--------------------+------------------+--------------------+-----------+-----------+
|     381|2017-04-01 00:00:22|2017-04-01 00:06:43|               31238|      14th & G St NW|             31202|      14th & R St NW|     W22257|     Member|
|     590|2017-04-01 00:02:02|2017-04-01 00:11:53|               31109|       7th & T St NW|             31278|      18th & R St NW|     W20006|     Member|
|    2938|2017-04-01 00:02:32|2017-04-01 00:51:30|               31289|Henry Bacon Dr & ...|             31238|      14th & G St NW|     W22225|     Casual|
|     380|2017-04-01 00:03:02|2017-04-01 00:09:23|        

In [8]:
# Ride-duration bounds, in seconds, used by the filter in the next cell.
fiveMins = 5 * 60          # 300 s
threeHours = 3 * 60 * 60   # 10800 s

In [9]:
# Keep only rides strictly longer than five minutes and strictly shorter than
# three hours.
# The filtered DataFrame must stay bound to `df`: the following cells call
# DataFrame methods (withColumn, drop, randomSplit) on it. Assigning the result
# of .count() to `df` would replace it with a plain int, so the row count is
# shown as a separate trailing expression instead.
df = df.filter(df['Duration'] > fiveMins).filter(df['Duration'] < threeHours)
df.count()

959105

### II. EDA 
- Most of the exploratory analysis was already done in a separate notebook, so it is not repeated here.

### III. Prediction Pipeline with PySpark

In [10]:
# Feature engineering on the ride records:
#   - "duration-log1p": log1p of "Duration" (becomes the regression target)
#   - parse "Start date" as a timestamp and derive day_of_week, day of year,
#     minute and hour from it
#   - "Start station number" and "Member type" are kept as-is for later encoding
#   - drop the columns that are not used afterwards
#
# NOTE: the column named "month" is filled with F.dayofyear, so it holds the
# day of the year (e.g. 91 for 2017-04-01), not the calendar month.

start_ts = F.to_timestamp('Start date', 'yyyy-MM-dd HH:mm:ss')
unused_columns = [
    "Start date",
    "End date",
    "Start station",
    "End station number",
    "End station",
    "Duration",
    "Bike number",
]

df = (
    df
    .withColumn("duration-log1p", F.log1p(F.col("Duration")))
    .withColumn("Start date", start_ts)
    .withColumn("day_of_week", F.dayofweek("Start date"))
    .withColumn("month", F.dayofyear("Start date"))
    .withColumn("minute", F.minute("Start date"))
    .withColumn("hour", F.hour("Start date"))
    .drop(*unused_columns)
)

In [11]:
# Preview the first five rows after feature engineering.
df.show(n=5)

+--------------------+-----------+------------------+-----------+-----+------+----+
|Start station number|Member type|    duration-log1p|day_of_week|month|minute|hour|
+--------------------+-----------+------------------+-----------+-----+------+----+
|               31238|     Member| 5.945420608606575|          7|   91|     0|   0|
|               31109|     Member|6.3818160174060985|          7|   91|     2|   0|
|               31289|     Casual| 7.985824666418917|          7|   91|     2|   0|
|               31121|     Member| 5.942799375126701|          7|   91|     3|   0|
|               31023|     Member| 6.049733455231958|          7|   91|     3|   0|
+--------------------+-----------+------------------+-----------+-----+------+----+
only showing top 5 rows



In [12]:
# Encode the categorical features 'Member type' and 'Start station number'.
# Each column is first mapped to a numeric index (StringIndexer) and the index
# is then one-hot encoded (OneHotEncoder).
#
# handleInvalid='keep' routes values that the indexer did not see while fitting
# to an extra index instead of raising. These stages are fitted inside the
# pipeline on a subset of the data (cross-validation folds / the training
# split), so a rarely used station can be missing at fit time and still show up
# at transform time; with the default ('error') that would abort the run.

rider_indexer = feat.StringIndexer(
    inputCol='Member type', outputCol='rider_idx', handleInvalid='keep'
)
rider_encoder = feat.OneHotEncoder(inputCol='rider_idx', outputCol='rider_enc')
station_indexer = feat.StringIndexer(
    inputCol='Start station number', outputCol='station_idx', handleInvalid='keep'
)
station_encoder = feat.OneHotEncoder(inputCol='station_idx', outputCol='station_enc')

In [13]:
# Rename the target column to 'label', which is the default label column name
# of the pyspark.ml regressor and evaluator used later in this notebook.
# withColumnRenamed already removes the old name, so no extra drop is needed.
df = df.withColumnRenamed('duration-log1p', 'label')

In [14]:
# VectorAssembler: combine the one-hot encoded categoricals and the
# date-derived columns into a single 'features' vector column.
# The assembler is only configured here; it is not applied to `df` directly but
# is used as a stage of the pipeline built in a later cell.
feature_columns = ['rider_enc', 'station_enc', 'day_of_week', 'month', 'minute', 'hour']
vec = feat.VectorAssembler(inputCols=feature_columns, outputCol='features')

In [15]:
# rf = mlreg.RandomForestRegressor(maxBins=347)
# pipeline = pipe.Pipeline(stages=[rider_indexer, rider_encoder, station_indexer, station_encoder, vec, rf])

In [16]:
# piped = pipeline.fit(df).transform(df)
# train, test = piped.randomSplit([.7, .3])

In [17]:
# train = train.select('features', 'label')
# test = test.select('features', 'label')

In [25]:
# Model, pipeline, evaluator and hyper-parameter grid for cross-validation.
#
# The random forest draws random samples/feature subsets, so a fixed seed is
# set to make repeated runs of the notebook give the same trees.
rf = mlreg.RandomForestRegressor(seed=42)

# Full pipeline: index + one-hot encode both categoricals, assemble the
# feature vector, then fit the regressor.
pipeline = pipe.Pipeline(
    stages=[rider_indexer, rider_encoder, station_indexer, station_encoder, vec, rf]
)

# Evaluator left at its defaults (it reads the 'label' and 'prediction' columns).
evaluation = evals.RegressionEvaluator()

# Grid search over the tree depth only: two candidates, maxDepth 3 and 5.
grid = tune.ParamGridBuilder().addGrid(rf.maxDepth, [3, 5]).build()

In [26]:
# 3-fold cross-validation of the whole pipeline over the parameter grid.
# The seed fixes how rows are assigned to folds, so the averaged metrics are
# comparable between runs.
cv = tune.CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=grid,
    evaluator=evaluation,
    numFolds=3,
    seed=42
)

In [27]:
# 70/30 train/test split. A fixed seed keeps the split stable across runs, so
# the held-out score reported at the end is comparable between runs.
train, test = df.randomSplit([.7, .3], seed=42)

# Run cross-validation on the training split only; `models` is the fitted
# cross-validator result and `best` is the pipeline model it selected.
models = cv.fit(train)
best = models.bestModel

In [28]:
# Cross-validation metric averaged over the folds, one value per parameter map
# in grid order (maxDepth=3 first, then maxDepth=5).
models.avgMetrics

[0.7546669035844837, 0.7394588444471769]

In [29]:
# Apply the model selected by cross-validation to the held-out test split.
results = best.transform(test)

In [30]:
# Score the test-set predictions with the same evaluator used during cross-validation.
evaluation.evaluate(results)

0.7391634634226683