In [1]:
from pyspark.sql import SQLContext

# Build a SQLContext on top of the SparkContext `sc`.
# NOTE(review): `sc` is not defined in the visible cells -- presumably injected
# by the notebook kernel; confirm before running on a fresh environment.
sqlContext = SQLContext(sc)

# Read the `inductive_loops` table of the `dice` keyspace through the
# Cassandra data source.
cassandra_reader = sqlContext.read.format("org.apache.spark.sql.cassandra")
il = cassandra_reader.options(table="inductive_loops", keyspace="dice").load()

# Keep a second handle on the freshly loaded frame; later cells reassign `il`.
il1 = il

In [151]:
# Point `il` back at the frame saved in `il1` when the table was loaded, so the
# cells below (which keep reassigning `il`) start again from the loaded data.
il = il1

In [152]:
from pyspark.sql.functions import udf
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import DoubleType

# UDFs that pull one component out of the `updated` value and return it as a
# double. They call `.hour`, `.minute` and `.weekday()` on the value, so they
# assume it arrives as a non-null datetime-like object -- TODO confirm there
# are no null timestamps.
hourTokens = udf(lambda ts: float(ts.hour), DoubleType())
minuteTokens = udf(lambda ts: float(ts.minute), DoubleType())
weekTokens = udf(lambda ts: float(ts.weekday()), DoubleType())


def _index_column(frame, source, target):
    """Fit a StringIndexer on `source`; return (fitted model, frame with `target` added)."""
    fitted = StringIndexer(inputCol=source, outputCol=target).fit(frame)
    return fitted, fitted.transform(frame)


# Append the time-derived columns and a double-typed copy of `stat`.
il = (
    il.withColumn('hour', hourTokens(il['updated']))
      .withColumn('minute', minuteTokens(il['updated']))
      .withColumn('weekday', weekTokens(il['updated']))
      .withColumn('stat_double', il['stat'].cast("double"))
)

# Encode the `id` strings as numeric indices in `id_double`.
indexer, il = _index_column(il, "id", "id_double")

# Replace missing lane descriptions with a single space before indexing them
# (presumably because the indexer cannot be fitted on nulls -- TODO confirm).
il = il.fillna({'lanedescription': ' '})
indexer, il = _index_column(il, "lanedescription", "lane")

In [153]:
# Drop rows whose raw `stat` is 0 *before* projecting. The predicate refers to
# `stat`, which is not among the selected columns; filtering first keeps the
# condition valid on its own instead of depending on Spark resolving a column
# that the projection has already removed.
# Resulting columns: the model inputs plus both targets (`avgspeed`, `stat_double`).
il = (
    il.where('stat != 0')
      .select('id_double', 'avgspeed', 'hour', 'minute', 'weekday', 'lane', 'stat_double')
)

In [154]:
from pyspark.ml.feature import VectorAssembler
# Columns that must stay out of the feature vector: they are the label columns
# used by the models fitted further down.
not_use = ['avgspeed', 'stat_double']

# Every remaining column of `il` becomes one slot of the `features` vector.
# NOTE: the list is derived from `il.columns` at run time, so this cell expects
# `il` to contain only model inputs and the two label columns.
feature_columns = [name for name in il.columns if name not in not_use]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
il = assembler.transform(il)

In [155]:
# Split the assembled rows 70% / 30% into training and held-out test sets,
# using a fixed seed for the random split.
train, test = il.randomSplit(weights=[0.7, 0.3], seed=1234)

# Show one training row as a quick check of the assembled columns.
train.first()

Row(id_double=13.0, avgspeed=0, hour=0.0, minute=25.0, weekday=1.0, lane=0.0, stat_double=6.0, features=DenseVector([13.0, 0.0, 25.0, 1.0, 0.0]))

In [126]:
from pyspark.ml.regression import DecisionTreeRegressor

# Decision-tree regression with `avgspeed` as the label; the features column is
# left at the estimator's default.
dt = DecisionTreeRegressor(
    labelCol='avgspeed',
    maxDepth=30,
    maxBins=50,
    minInstancesPerNode=25)

# NOTE(review): an earlier inline note on this line recorded rmse=0.495445, but
# the evaluation output saved in this notebook reports 10.9611 on the test
# split. The older figure appears stale (or measured on different data) --
# TODO confirm.
model = dt.fit(train)

In [127]:
from pyspark.ml.evaluation import RegressionEvaluator
# Score the current `model` on the held-out rows and report RMSE against
# `avgspeed`. `model` is reassigned by several fitting cells, so run this
# directly after the regressor fit.
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="avgspeed",
    predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 10.9611


In [128]:
from pyspark.ml.classification import NaiveBayes

# Multinomial naive Bayes classifier with `stat_double` as the label.
nb = NaiveBayes(
    labelCol='stat_double',
    modelType="multinomial",
    smoothing=1.0)

# NOTE: the result is stored in `model`, which the decision-tree classifier
# cell below assigns again -- evaluate this model before running that cell if
# its metrics are wanted.
model = nb.fit(train)

In [158]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision-tree classifier with `stat_double` as the label. This rebinds both
# `dt` and `model`, replacing any estimator/model fitted in earlier cells.
dt = DecisionTreeClassifier(
    labelCol='stat_double',
    maxDepth=30,
    maxBins=50,
    minInstancesPerNode=10)
model = dt.fit(train)

In [159]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the current `model` on the held-out rows and report accuracy against
# `stat_double`. `model` is whatever the most recently executed fitting cell
# assigned, so run this directly after the classifier fit.
predictions = model.transform(test)
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="stat_double")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.831538305223
