In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark regression example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [2]:
df = spark.read.format('com.databricks.spark.csv').\
                       options(header='true', \
                       inferschema='true').\
            load("../data/Advertising.csv",header=True)

In [3]:
df.show(5,True)
df.printSchema()

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows

root
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



In [4]:
df.describe().show()

+-------+-----------------+------------------+------------------+------------------+
|summary|               TV|             Radio|         Newspaper|             Sales|
+-------+-----------------+------------------+------------------+------------------+
|  count|              200|               200|               200|               200|
|   mean|         147.0425|23.264000000000024|30.553999999999995|14.022500000000003|
| stddev|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|
|    min|              0.7|               0.0|               0.3|               1.6|
|    max|            296.4|              49.6|             114.0|              27.0|
+-------+-----------------+------------------+------------------+------------------+



In [5]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

# I provide two ways to build the features and labels

# method 1 (good for small feature):
#def transData(row):
#    return Row(label=row["Sales"],
#               features=Vectors.dense([row["TV"],
#                                       row["Radio"],
#                                       row["Newspaper"]]))

# Method 2 (good for large features):
def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label'])

In [6]:
transformed= transData(df)
transformed.show(5)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
+-----------------+-----+
only showing top 5 rows



In [7]:
#convert data to dense vector
# convert the data to dense vector
def transData(data):
    return data.rdd.map(lambda r: [r[-1], Vectors.dense(r[:-1])]).\
           toDF(['label','features'])

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

data= transData(df)
data.show()

+-----+-----------------+
|label|         features|
+-----+-----------------+
| 22.1|[230.1,37.8,69.2]|
| 10.4| [44.5,39.3,45.1]|
|  9.3| [17.2,45.9,69.3]|
| 18.5|[151.5,41.3,58.5]|
| 12.9|[180.8,10.8,58.4]|
|  7.2|  [8.7,48.9,75.0]|
| 11.8| [57.5,32.8,23.5]|
| 13.2|[120.2,19.6,11.6]|
|  4.8|    [8.6,2.1,1.0]|
| 10.6| [199.8,2.6,21.2]|
|  8.6|  [66.1,5.8,24.2]|
| 17.4| [214.7,24.0,4.0]|
|  9.2| [23.8,35.1,65.9]|
|  9.7|   [97.5,7.6,7.2]|
| 19.0|[204.1,32.9,46.0]|
| 22.4|[195.4,47.7,52.9]|
| 12.5|[67.8,36.6,114.0]|
| 24.4|[281.4,39.6,55.8]|
| 11.3| [69.2,20.5,18.3]|
| 14.6|[147.3,23.9,19.1]|
+-----+-----------------+
only showing top 20 rows



In [8]:
#deal with categorical variables
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4
# distinct values are treated as continuous.

featureIndexer = VectorIndexer(inputCol="features", \
                               outputCol="indexedFeatures",\
                               maxCategories=4).fit(transformed)

data = featureIndexer.transform(transformed)

In [9]:
# Split the data into training and test sets (40% held out for testing)
(trainingData, testData) = transformed.randomSplit([0.6, 0.4])

In [10]:
trainingData.show(5)
testData.show(5)

+---------------+-----+
|       features|label|
+---------------+-----+
| [0.7,39.6,8.7]|  1.6|
| [4.1,11.6,5.7]|  3.2|
| [5.4,29.9,9.4]|  5.3|
|[7.8,38.9,50.6]|  6.6|
| [8.4,27.2,2.1]|  5.7|
+---------------+-----+
only showing top 5 rows

+----------------+-----+
|        features|label|
+----------------+-----+
| [7.3,28.1,41.4]|  5.5|
| [8.7,48.9,75.0]|  7.2|
|[11.7,36.9,45.2]|  7.3|
| [13.1,0.4,25.6]|  5.3|
|[17.2,45.9,69.3]|  9.3|
+----------------+-----+
only showing top 5 rows



In [11]:
# Import LinearRegression class
from pyspark.ml.regression import GeneralizedLinearRegression

# Define LinearRegression algorithm
glr = GeneralizedLinearRegression(family="gaussian", link="identity",\
                                  maxIter=10, regParam=0.3)

In [12]:
# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, glr])

model = pipeline.fit(trainingData)

In [13]:
def modelsummary(model):
    import numpy as np
    print ("Note: the last rows are the information for Intercept")
    print ("##","-------------------------------------------------")
    print ("##","  Estimate   |   Std.Error | t Values  |  P-value")
    coef = np.append(list(model.coefficients),model.intercept)
    Summary=model.summary

    for i in range(len(Summary.pValues)):
        print ("##",'{:10.6f}'.format(coef[i]),\
        '{:10.6f}'.format(Summary.coefficientStandardErrors[i]),\
        '{:8.3f}'.format(Summary.tValues[i]),\
        '{:10.6f}'.format(Summary.pValues[i]))

    print ("##",'---')
#     print ("##","Mean squared error: % .6f" \
#            % Summary.meanSquaredError, ", RMSE: % .6f" \
#            % Summary.rootMeanSquaredError )
#     print ("##","Multiple R-squared: %f" % Summary.r2, ", \
#             Total iterations: %i"% Summary.totalIterations)

In [14]:
modelsummary(model.stages[-1])

Note: the last rows are the information for Intercept
## -------------------------------------------------
##   Estimate   |   Std.Error | t Values  |  P-value
##   0.042681   0.001892   22.564   0.000000
##   0.189469   0.011644   16.272   0.000000
##   0.000244   0.007758    0.032   0.974925
##   3.245604   0.421313    7.704   0.000000
## ---


In [15]:
# Make predictions.
predictions = model.transform(testData)

In [16]:
# Select example rows to display.
predictions.select("features","label","prediction").show(5)

+----------------+-----+------------------+
|        features|label|        prediction|
+----------------+-----+------------------+
| [7.3,28.1,41.4]|  5.5|  8.89136992228887|
| [8.7,48.9,75.0]|  7.2|12.900288264771042|
|[11.7,36.9,45.2]|  7.3|10.747422021751568|
| [13.1,0.4,25.6]|  5.3|3.8867707344668174|
|[17.2,45.9,69.3]|  9.3|12.693278371797728|
+----------------+-----+------------------+
only showing top 5 rows



In [17]:
#Evaluation
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import RegressionEvaluator
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 1.59258


In [18]:
y_true = predictions.select("label").toPandas()
y_pred = predictions.select("prediction").toPandas()

import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {0}'.format(r2_score))

r2_score: 0.9030687189774794


In [19]:
#Decision Tree Regression
from pyspark.ml.regression import DecisionTreeRegressor

# Train a DecisionTree model.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

In [20]:
# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, dt])

model = pipeline.fit(trainingData)

In [21]:
# Make predictions.
predictions = model.transform(testData)

In [23]:
# Select example rows to display.
predictions.select("features","label","prediction").show(5)

+----------------+-----+----------+
|        features|label|prediction|
+----------------+-----+----------+
| [7.3,28.1,41.4]|  5.5|     7.425|
| [8.7,48.9,75.0]|  7.2|       8.7|
|[11.7,36.9,45.2]|  7.3|     7.425|
| [13.1,0.4,25.6]|  5.3|       6.2|
|[17.2,45.9,69.3]|  9.3|       8.7|
+----------------+-----+----------+
only showing top 5 rows



In [24]:
#Evaluation
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import RegressionEvaluator
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 1.39311


In [25]:
y_true = predictions.select("label").toPandas()
y_pred = predictions.select("prediction").toPandas()

import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {0}'.format(r2_score))

r2_score: 0.9258284860063273


In [26]:
#check importance of the features
model.stages[1].featureImportances

SparseVector(3, {0: 0.6052, 1: 0.3822, 2: 0.0126})

In [27]:
#Random Forest Regression
# Import LinearRegression class
from pyspark.ml.regression import RandomForestRegressor

# Define LinearRegression algorithm
rf = RandomForestRegressor() # featuresCol="indexedFeatures",numTrees=2, maxDepth=2, seed=42

In [28]:
# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])
model = pipeline.fit(trainingData)

In [29]:
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("features","label", "prediction").show(5)

+----------------+-----+------------------+
|        features|label|        prediction|
+----------------+-----+------------------+
| [7.3,28.1,41.4]|  5.5|10.590133669392182|
| [8.7,48.9,75.0]|  7.2|13.928263888888889|
|[11.7,36.9,45.2]|  7.3|10.834172130930645|
| [13.1,0.4,25.6]|  5.3| 8.416003599748988|
|[17.2,45.9,69.3]|  9.3|12.391805555555555|
+----------------+-----+------------------+
only showing top 5 rows



In [30]:
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 2.55544


In [31]:
import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {:4.3f}'.format(r2_score))

r2_score: 0.926


In [32]:
model.stages[-1].featureImportances

SparseVector(3, {0: 0.4518, 1: 0.3419, 2: 0.2062})

In [33]:
model.stages[-1].trees

[DecisionTreeRegressionModel (uid=dtr_66570b310ce0) of depth 5 with 47 nodes,
 DecisionTreeRegressionModel (uid=dtr_bf4e9b70d812) of depth 5 with 51 nodes,
 DecisionTreeRegressionModel (uid=dtr_8a8684692d28) of depth 5 with 51 nodes,
 DecisionTreeRegressionModel (uid=dtr_91363a1efb74) of depth 5 with 35 nodes,
 DecisionTreeRegressionModel (uid=dtr_935bff219b09) of depth 5 with 37 nodes,
 DecisionTreeRegressionModel (uid=dtr_a8661ecbfcbf) of depth 5 with 47 nodes,
 DecisionTreeRegressionModel (uid=dtr_4446cd7ab95b) of depth 5 with 41 nodes,
 DecisionTreeRegressionModel (uid=dtr_db147add49e2) of depth 5 with 47 nodes,
 DecisionTreeRegressionModel (uid=dtr_2177275ff459) of depth 5 with 41 nodes,
 DecisionTreeRegressionModel (uid=dtr_0cfb24ddc428) of depth 5 with 43 nodes,
 DecisionTreeRegressionModel (uid=dtr_ebe0ed866ff0) of depth 5 with 41 nodes,
 DecisionTreeRegressionModel (uid=dtr_a1b8f757049d) of depth 5 with 43 nodes,
 DecisionTreeRegressionModel (uid=dtr_ceb9fb380cf8) of depth 5 w

In [34]:
#Gradient Boosted Decision Trees
# Import LinearRegression class
from pyspark.ml.regression import GBTRegressor

# Define LinearRegression algorithm
rf = GBTRegressor() #numTrees=2, maxDepth=2, seed=42

In [35]:
# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])
model = pipeline.fit(trainingData)

In [36]:
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("features","label", "prediction").show(5)

+----------------+-----+------------------+
|        features|label|        prediction|
+----------------+-----+------------------+
| [7.3,28.1,41.4]|  5.5| 6.946822197156063|
| [8.7,48.9,75.0]|  7.2|7.4928311748441665|
|[11.7,36.9,45.2]|  7.3| 7.581659086163565|
| [13.1,0.4,25.6]|  5.3| 5.960418134307112|
|[17.2,45.9,69.3]|  9.3| 7.974055476837845|
+----------------+-----+------------------+
only showing top 5 rows



In [37]:
#Evaluation
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 1.2784


In [38]:
import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {:4.3f}'.format(r2_score))

r2_score: 0.926


In [39]:
model.stages[-1].featureImportances

SparseVector(3, {0: 0.3482, 1: 0.3294, 2: 0.3224})