In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator


# Reuse the already-active Spark session if there is one, else start one.
spark = SparkSession.builder.getOrCreate()

# Synthetic regression dataset: one row per id in [0, num_samples).
#
# Every RAND() call is given an explicit seed so that the generated values --
# and with them the seeded split below, the fitted coefficients and the
# RMSE -- are repeatable from run to run (for an unchanged partitioning of
# the range). The seeds are deliberately different per column: RAND() calls
# that share a seed would produce the same value on every row, making the
# columns exact multiples of one another.
#
# NOTE: `label` is drawn independently of `feature1` and `feature2`, so there
# is no real relationship for a regression to learn from this data.
num_samples = 1000
simulated_data = (spark.range(num_samples)
                  .selectExpr(
                      "id as id",
                      "(RAND(11) * 10) as feature1",
                      "(RAND(23) * 5) as feature2",
                      "(RAND(37) * 20 - 10) as label"))

# Seeded 80/20 split into training and testing sets.
train_ratio = 0.8
test_ratio = 1 - train_ratio
training_data, testing_data = simulated_data.randomSplit(
    [train_ratio, test_ratio], seed=87)

# Stage 1: pack the two raw feature columns into a single vector column.
assembler = (VectorAssembler()
             .setInputCols(["feature1", "feature2"])
             .setOutputCol("features"))

# Stage 2: scale the assembled vector (scaler left at its default settings).
scaler = (StandardScaler()
          .setInputCol("features")
          .setOutputCol("scaledFeatures"))

# Stage 3: linear regression of `label` on the scaled feature vector.
regression = (LinearRegression()
              .setFeaturesCol("scaledFeatures")
              .setLabelCol("label"))

# Chain the stages in execution order and fit them on the training split.
stages = [assembler, scaler, regression]
pipeline = Pipeline().setStages(stages)
pipeline_model = pipeline.fit(training_data)

# The last pipeline stage is the fitted linear regression model; report its
# learned parameters.
fitted_regression = pipeline_model.stages[-1]
coefficients, intercept = (fitted_regression.coefficients,
                           fitted_regression.intercept)
print("Coefficients: ", coefficients)
print("Intercept: ", intercept)

# Run the whole fitted pipeline over the held-out split.
preds = pipeline_model.transform(testing_data)

# Preview the first five predictions next to their inputs and true labels.
preview = preds.select("features", "label", "prediction")
preview.show(5)

# Score the held-out predictions with root-mean-squared error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(preds)
print("RMSE: ", rmse)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/14 08:40:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/14 08:40:32 WARN Instrumentation: [88a15e49] regParam is zero, which might cause numerical instability and overfitting.
24/11/14 08:40:32 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/11/14 08:40:32 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Coefficients:  [-0.16184509573533976,0.0638702583231202]
Intercept:  0.07482663030754735
+--------------------+-------------------+--------------------+
|            features|              label|          prediction|
+--------------------+-------------------+--------------------+
|[5.46207738328965...| -5.833908718472404| -0.1587918689715602|
|[3.73488290844839...| -5.626003665765125|-0.01445992317734...|
|[2.63299818887171...|-1.3399181733082983|-0.06699945351753223|
|[1.04948037378680...|  5.705691021243135|  0.1066981082398636|
|[8.40206030459368...| -2.888887885238627|-0.21313216352948952|
+--------------------+-------------------+--------------------+
only showing top 5 rows

RMSE:  5.545916670596773
