### Isotonic regression

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("isotone").getOrCreate()

In [3]:
from pyspark.ml.regression import IsotonicRegression

In [5]:
df = spark.read.format("libsvm").load("newd/sample_libsvm.txt")

In [7]:
model = IsotonicRegression().fit(df)

In [8]:
model.transform(df).show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|(692,[127,128,129...|       0.0|
|  1.0|(692,[158,159,160...|       0.0|
|  1.0|(692,[124,125,126...|       0.0|
|  1.0|(692,[152,153,154...|       0.0|
|  1.0|(692,[151,152,153...|       0.0|
|  0.0|(692,[129,130,131...|       0.0|
|  1.0|(692,[158,159,160...|       0.0|
|  1.0|(692,[99,100,101,...|       0.0|
|  0.0|(692,[154,155,156...|       0.0|
|  0.0|(692,[127,128,129...|       0.0|
|  1.0|(692,[154,155,156...|       0.0|
|  0.0|(692,[153,154,155...|       0.0|
|  0.0|(692,[151,152,153...|       0.0|
|  1.0|(692,[129,130,131...|       0.0|
|  0.0|(692,[154,155,156...|       0.0|
|  1.0|(692,[150,151,152...|       0.0|
|  0.0|(692,[124,125,126...|       0.0|
|  0.0|(692,[152,153,154...|       0.0|
|  1.0|(692,[97,98,99,12...|       0.0|
|  1.0|(692,[124,125,126...|       0.0|
+-----+--------------------+----------+
only showing top 20 rows



In [9]:
print("Boundaries in increasing order: %s\n" % str(model.boundaries))
print("Predictions associated with the boundaries: %s\n" % str(model.predictions))

Boundaries in increasing order: [0.0,0.0]

Predictions associated with the boundaries: [0.0,1.0]



### Factorization machines regressor

In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import FMRegressor
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.evaluation import RegressionEvaluator

In [13]:
data = spark.read.format("libsvm").load("newd/sample_libsvm.txt")

In [14]:
featureScaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures").fit(data)


In [15]:
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [16]:
fm = FMRegressor(featuresCol="scaledFeatures", stepSize=0.001)

In [17]:
pipeline = Pipeline(stages=[featureScaler, fm])

In [18]:
model = pipeline.fit(trainingData)

In [19]:
predictions = model.transform(testData)

In [20]:
predictions.select("prediction", "label", "features").show(5)

+--------------------+-----+--------------------+
|          prediction|label|            features|
+--------------------+-----+--------------------+
|-0.01886395284396556|  0.0|(692,[98,99,100,1...|
|-0.00386795088229...|  0.0|(692,[121,122,123...|
| 0.09396971305869348|  0.0|(692,[123,124,125...|
| 0.09483501910400573|  0.0|(692,[124,125,126...|
| -0.0841970962179205|  0.0|(692,[124,125,126...|
+--------------------+-----+--------------------+
only showing top 5 rows



In [21]:
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")

In [22]:
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

fmModel = model.stages[1]
print("Factors: " + str(fmModel.factors))
print("Linear: " + str(fmModel.linear))
print("Intercept: " + str(fmModel.intercept))

Root Mean Squared Error (RMSE) on test data = 0.114244
Factors: DenseMatrix([[ 4.65697140e-03,  2.44200958e-03,  5.69742241e-03, ...,
              -2.61749192e-03,  8.66223783e-03,  7.80579087e-03],
             [ 1.03268410e-03, -5.41467329e-05,  2.01416099e-02, ...,
               9.51492996e-03,  1.32786185e-03,  1.49833185e-02],
             [ 2.10808670e-03, -5.55590856e-03, -4.71632669e-03, ...,
              -1.59468272e-02, -2.20016581e-02,  7.12767925e-03],
             ...,
             [-2.09762408e-02, -1.34831057e-02,  7.60570353e-03, ...,
               2.41005748e-02,  1.01676533e-02, -3.31041699e-03],
             [-7.35433952e-03,  1.57924118e-02,  2.22108335e-03, ...,
               1.26123368e-03,  1.61491501e-02, -3.37306435e-03],
             [ 9.28252146e-03,  7.53153697e-03, -1.34392142e-02, ...,
               2.06166303e-02,  8.70636907e-03,  8.19398363e-03]])
Linear: [0.0027371271198711024,0.0027371271198711024,0.0027371271198711024,0.0027371271198711024,0.00