In [2]:
from pyspark.sql import SQLContext

# Wrap the SparkContext in a SQLContext so DataFrames can be read below.
# NOTE(review): `sc` is not defined in this notebook -- presumably injected
# by the PySpark kernel/shell; confirm before running on a fresh kernel.
sqlContext = SQLContext(sparkContext=sc)

In [3]:
# Read the `counters` table of the `dice` keyspace through the
# Spark-Cassandra data source.
counters = (
    sqlContext.read
    .format("org.apache.spark.sql.cassandra")
    .options(table="counters", keyspace="dice")
    .load()
)
# Mark the DataFrame for caching; as the last expression of the cell this
# also displays the DataFrame schema summary.
counters.cache()

DataFrame[id: string, modified: timestamp, contentname: string, crsid: string, description: string, icon: string, stevci_cestaopis: string, stevci_gap: float, stevci_hit: int, stevci_lokacijaopis: string, stevci_pasopis: string, stevci_smeropis: string, stevci_stat: int, stevci_statopis: string, stevci_stev: int, title: string, x: float, x_wgs: float, y: float, y_wgs: float]

In [4]:
# List every distinct (status description, status code) pair without
# truncating the description text.
(
    counters
    .select('stevci_statopis', 'stevci_stat')
    .distinct()
    .show(truncate=False)
)

+---------------------+-----------+
|stevci_statopis      |stevci_stat|
+---------------------+-----------+
|Zgoščen promet       |3          |
|Ni prometa           |6          |
|Gost promet          |4          |
|Gost promet z zastoji|5          |
|Povečan promet       |2          |
|Normalen promet      |1          |
+---------------------+-----------+



In [4]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, FloatType

# Derive time-of-day and day-of-week features from the `modified` timestamp.
#
# Python UDFs receive None for NULL cells and Spark does not short-circuit
# around them, so each lambda checks for None itself and returns None
# (-> NULL in the new column) instead of raising AttributeError inside the
# executor and failing the whole job.
hourTokens = udf(
    lambda time: time.hour if time is not None else None,
    IntegerType(),
)
counters = counters.withColumn('hour', hourTokens(counters.modified))

# datetime.weekday() numbering: Monday == 0 ... Sunday == 6.
weekTokens = udf(
    lambda time: time.weekday() if time is not None else None,
    IntegerType(),
)
counters = counters.withColumn('weekday', weekTokens(counters.modified))

In [14]:
# Map each traffic-status description to a float level (1.0 .. 6.0) that is
# used as the regression label `stat`.
map_stat = dict([
    (u'Ni prometa', 1.0),
    (u'Normalen promet', 2.0),
    (u'Pove\u010dan promet', 3.0),
    (u'Zgo\u0161\u010den promet', 4.0),
    (u'Gost promet', 5.0),
    (u'Gost promet z zastoji', 6.0),
])

# Strict lookup: a description missing from `map_stat` raises KeyError.
statTokens = udf(lambda label: map_stat[label], returnType=FloatType())
counters = counters.withColumn('stat', statTokens(counters['stevci_statopis']))

In [15]:
# Sanity check: show which `stat` value each status description received.
(
    counters
    .select('stevci_statopis', 'stat')
    .distinct()
    .show()
)

+--------------------+----+
|     stevci_statopis|stat|
+--------------------+----+
|          Ni prometa| 1.0|
|      Zgoščen promet| 4.0|
|      Povečan promet| 3.0|
|     Normalen promet| 2.0|
|         Gost promet| 5.0|
|Gost promet z zas...| 6.0|
+--------------------+----+



In [16]:
from pyspark.ml.feature import RFormula

# Build the label/feature columns with an R-style formula: `stat` is the
# label; predictors are id, hour, weekday plus the hour:weekday and id:hour
# interaction terms.
formula = RFormula(
    formula="stat ~ id + hour + weekday + hour : weekday + id : hour + weekday"
)
# Fit the formula on the full frame, apply it, and keep only the two columns
# the regressors below consume.
data = (
    formula.fit(counters)
    .transform(counters)
    .select('stat', 'features')
)
#data.show(truncate=False)
#data.printSchema()

In [17]:
# 70/30 train/test split with a fixed seed so the split is repeatable.
train, test = data.randomSplit(weights=[0.7, 0.3], seed=1234)

In [18]:
# Show which label values are present in the training split.
(
    train
    .select('stat')
    .distinct()
    .show()
)

+----+
|stat|
+----+
| 5.0|
| 2.0|
| 3.0|
| 1.0|
| 6.0|
| 4.0|
+----+



In [49]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Candidate 1: regularised linear regression on the `stat` label.
# Author's recorded result: rmse=0.921361
lr = LinearRegression(
    labelCol='stat',
    regParam=0.001,
)
# `model` is the name the evaluation cell reads; the most recently run
# candidate cell determines what gets evaluated.
model = lr.fit(train)

In [72]:
from pyspark.ml.regression import GeneralizedLinearRegression

# Candidate 2: generalized linear regression, Poisson family with log link.
# Author's recorded result: rmse=0.91962
glr = GeneralizedLinearRegression(
    labelCol='stat',
    family="poisson",
    link="log",
    regParam=0.0001,
)
# `model` is the name the evaluation cell reads; the most recently run
# candidate cell determines what gets evaluated.
model = glr.fit(train)

In [94]:
from pyspark.ml.regression import DecisionTreeRegressor

# Candidate 3: single decision-tree regressor.
# Author's recorded result: rmse=0.507627
dt = DecisionTreeRegressor(
    labelCol='stat',
    maxDepth=30,
    maxBins=50,
    minInstancesPerNode=5,
)
# `model` is the name the evaluation cell reads; the most recently run
# candidate cell determines what gets evaluated.
model = dt.fit(train)

In [118]:
from pyspark.ml.regression import RandomForestRegressor

# Candidate 4: random-forest regressor with 40 trees.
# Author's recorded result: rmse=0.490043
rf = RandomForestRegressor(
    labelCol='stat',
    numTrees=40,
    maxDepth=30,
    maxBins=50,
    minInstancesPerNode=4,
)
# `model` is the name the evaluation cell reads; the most recently run
# candidate cell determines what gets evaluated.
model = rf.fit(train)

In [119]:
# Score whichever estimator is currently bound to `model` on the held-out
# split and report its root-mean-squared error.
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="stat",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.490043
