In [1]:
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)
bt = sqlContext.read.format("org.apache.spark.sql.cassandra").load(table="bt_sensors", keyspace="dice")

In [2]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, DateType

timestampTokens = udf(lambda a, b: a + (b - a)/2, DateType())
bt = bt.withColumn('timestamp', timestampTokens(bt.timestampfrom, bt.timestampto))
hourTokens = udf(lambda time: time.hour, IntegerType())
bt = bt.withColumn('hour', hourTokens(bt.timestamp))
minuteTokens = udf(lambda time: time.minute, IntegerType())
bt = bt.withColumn('minute', minuteTokens(bt.timestamp))
weekTokens = udf(lambda time: time.weekday(), IntegerType())
bt = bt.withColumn('weekday', weekTokens(bt.timestamp))

In [74]:
bt_hourly = bt.groupBy('frombtid', 'tobtid', 'hour').agg({"avgspeed": "mean", 'count':'mean'})

In [77]:
sorted(bt_hourly.where('avg(count) > 10').collect(), key=lambda x: x['avg(avgspeed)'])

[Row(frombtid=u'BTR0207', tobtid=u'BTR0204', hour=14, avg(count)=18.53846153846154, avg(avgspeed)=14.159915337195763),
 Row(frombtid=u'BTR0207', tobtid=u'BTR0204', hour=13, avg(count)=14.153846153846153, avg(avgspeed)=15.002568061535175),
 Row(frombtid=u'BTR0207', tobtid=u'BTR0204', hour=10, avg(count)=10.083333333333334, avg(avgspeed)=20.18646256128947),
 Row(frombtid=u'BTR0207', tobtid=u'BTR0204', hour=15, avg(count)=15.38888888888889, avg(avgspeed)=20.244712670644123),
 Row(frombtid=u'BTR0207', tobtid=u'BTR0204', hour=6, avg(count)=12.384615384615385, avg(avgspeed)=20.269143471351036),
 Row(frombtid=u'BTR0204', tobtid=u'BTR0207', hour=14, avg(count)=22.615384615384617, avg(avgspeed)=20.727346640366775),
 Row(frombtid=u'BTR0207', tobtid=u'BTR0204', hour=8, avg(count)=12.25, avg(avgspeed)=21.02135866880417),
 Row(frombtid=u'BTR0202', tobtid=u'BTR0201', hour=5, avg(count)=12.11111111111111, avg(avgspeed)=21.2668399810791),
 Row(frombtid=u'BTR0204', tobtid=u'BTR0207', hour=13, avg(count

In [3]:
bt = bt.where('count > 10')

In [24]:
from pyspark.ml.feature import RFormula

formula = RFormula(formula="avgspeed ~ frombtid + tobtid + frombtid : tobtid + hour + weekday + hour : weekday")
data = formula.fit(bt).transform(bt)
data = data.select('avgspeed', 'features')

In [25]:
train, test = data.randomSplit([0.7, 0.3], 1234)

In [26]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

lr = LinearRegression(labelCol='avgspeed')
model = lr.fit(train) #rmse=0.921361

Py4JJavaError: An error occurred while calling o537.fit.
: java.lang.AssertionError: assertion failed: lapack.dppsv returned 23.
	at scala.Predef$.assert(Predef.scala:170)
	at org.apache.spark.mllib.linalg.CholeskyDecomposition$.solve(CholeskyDecomposition.scala:40)
	at org.apache.spark.ml.optim.WeightedLeastSquares.fit(WeightedLeastSquares.scala:140)
	at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:180)
	at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:70)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:90)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)


In [15]:
from pyspark.ml.regression import GeneralizedLinearRegression

glr = GeneralizedLinearRegression(family="poisson", link="log", regParam=1, labelCol='avgspeed')
model = glr.fit(train) #rmse=0.91962

Py4JJavaError: An error occurred while calling o378.fit.
: java.lang.AssertionError: assertion failed: lapack.dppsv returned 23.
	at scala.Predef$.assert(Predef.scala:170)
	at org.apache.spark.mllib.linalg.CholeskyDecomposition$.solve(CholeskyDecomposition.scala:40)
	at org.apache.spark.ml.optim.WeightedLeastSquares.fit(WeightedLeastSquares.scala:140)
	at org.apache.spark.ml.regression.GeneralizedLinearRegression$FamilyAndLink.initialize(GeneralizedLinearRegression.scala:340)
	at org.apache.spark.ml.regression.GeneralizedLinearRegression.train(GeneralizedLinearRegression.scala:275)
	at org.apache.spark.ml.regression.GeneralizedLinearRegression.train(GeneralizedLinearRegression.scala:139)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:90)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)


In [11]:
from pyspark.ml.regression import DecisionTreeRegressor

dt = DecisionTreeRegressor(maxDepth=30, maxBins=50, minInstancesPerNode=5, labelCol='avgspeed')
model = dt.fit(train) #rmse=0.507627

In [13]:
from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(maxDepth=30, maxBins=50, minInstancesPerNode=4, numTrees=40, labelCol='avgspeed')
model = rf.fit(train) #rmse=0.490043

In [14]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(labelCol="avgspeed", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 5.13024
