<a href="https://colab.research.google.com/github/sasansharifipour/Spark_Class/blob/main/LinearRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession: getOrCreate() reuses an active session
# if one exists, otherwise it starts a new one with default settings.
spark = SparkSession.builder.getOrCreate()

Linear Regression

In [3]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Sample regression dataset shipped with the Spark distribution, stored in
# LIBSVM format. The path is relative to the directory where the tarball was
# extracted.
lr_data_path = "spark-3.0.1-bin-hadoop2.7/data/mllib/sample_linear_regression_data.txt"
libsvm_reader = spark.read.format("libsvm")
training = libsvm_reader.load(lr_data_path)

In [6]:
# Configure (but do not yet fit) an elastic-net regularised linear regression.
lr = LinearRegression(
    maxIter=10,           # upper bound on optimiser iterations
    regParam=0.3,         # overall regularisation strength
    elasticNetParam=0.8,  # L1/L2 mix: 0 is pure L2, 1 is pure L1
)

In [7]:
# Fit the configured estimator on the training DataFrame; the result is the
# fitted model whose coefficients and summary are inspected below.
lrModel = lr.fit(training)

In [8]:
# Report the learned weights and bias of the fitted linear model.
coefficients_text = str(lrModel.coefficients)
intercept_text = str(lrModel.intercept)
print("Coefficients : %s" % coefficients_text)
print("Intercept : %s" % intercept_text)

Coefficients : [0.0,0.32292516677405936,-0.3438548034562218,1.9156017023458414,0.05288058680386263,0.765962720459771,0.0,-0.15105392669186682,-0.21587930360904642,0.22025369188813426]
Intercept : 0.1598936844239736


In [10]:
# Optimiser diagnostics: how many iterations were run and the objective value
# recorded at each of them.
trainingSummary = lrModel.summary
iteration_count = trainingSummary.totalIterations
objective_trace = str(trainingSummary.objectiveHistory)
print("numIterations : %d" % iteration_count)
print("objectiveHistory : %s" % objective_trace)

numIterations : 7
objectiveHistory : [0.49999999999999994, 0.4967620357443381, 0.4936361664340463, 0.4936351537897608, 0.4936351214177871, 0.49363512062528014, 0.4936351206216114]


In [11]:
# Goodness of fit on the training data: show the first rows of the residuals
# column, then the aggregate error metrics.
residuals_frame = trainingSummary.residuals
residuals_frame.show()
training_rmse = trainingSummary.rootMeanSquaredError
training_r2 = trainingSummary.r2
print("RMSE: %f" % training_rmse)
print("r2: %f" % training_r2)

+--------------------+
|           residuals|
+--------------------+
|  -9.889232683103197|
|  0.5533794340053554|
|  -5.204019455758823|
| -20.566686715507508|
|    -9.4497405180564|
|  -6.909112502719486|
|  -10.00431602969873|
|   2.062397807050484|
|  3.1117508432954772|
| -15.893608229419382|
|  -5.036284254673026|
|   6.483215876994333|
|  12.429497299109002|
|  -20.32003219007654|
| -2.0049838218725005|
| -17.867901734183793|
|   7.646455887420495|
| -2.2653482182417406|
|-0.10308920436195645|
|  -1.380034070385301|
+--------------------+
only showing top 20 rows

RMSE: 10.189077
r2: 0.022861


Gradient-Boosted Tree Regression

In [19]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [14]:
# Sample LIBSVM dataset shipped with the Spark distribution, used for the
# gradient-boosted tree example. The path is relative to the extraction
# directory.
gbt_data_path = "spark-3.0.1-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt"
gbt_reader = spark.read.format("libsvm")
data = gbt_reader.load(gbt_data_path)

In [15]:
# Detect categorical features automatically and index them. With
# maxCategories=4, any feature with more than 4 distinct values is treated as
# continuous. The indexer is fitted on the full dataset, before the split.
indexer_stage = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
)
featureIndexer = indexer_stage.fit(data)

In [31]:
# Hold out roughly 30% of the rows for testing. A fixed seed makes the split,
# and therefore the test RMSE reported below, repeatable across re-runs of the
# notebook; without it every run samples a different partition.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

In [32]:
# Gradient-boosted tree regressor reading the indexed feature vector produced
# by the VectorIndexer stage; maxIter caps the number of boosting iterations.
gbt = GBTRegressor(
    featuresCol="indexedFeatures",
    maxIter=10,
)

In [33]:
# Chain feature indexing and the regressor so they run as a single unit.
pipeline_stages = [featureIndexer, gbt]
pipeline = Pipeline(stages=pipeline_stages)

In [34]:
# Fit the whole pipeline on the training split only; the held-out rows in
# testData are not seen by the regressor here.
model = pipeline.fit(trainingData)

In [35]:
# Apply the fitted pipeline to the held-out split; the resulting DataFrame
# carries the "prediction" column selected and evaluated in later cells.
predictions = model.transform(testData)

In [36]:
# Peek at a handful of rows to compare predicted and actual labels.
prediction_preview = predictions.select("prediction", "label", "features")
prediction_preview.show(n=5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[95,96,97,12...|
|       0.0|  0.0|(692,[98,99,100,1...|
|       0.0|  0.0|(692,[121,122,123...|
|       0.0|  0.0|(692,[122,123,124...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows



In [37]:
# Evaluator that scores the "prediction" column against "label" using
# root-mean-squared error.
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse",
)

In [38]:
# Score the held-out predictions and report the resulting test error.
rmse = evaluator.evaluate(dataset=predictions)
rmse_report = "RMSE on test Data = %g" % rmse
print(rmse_report)

RMSE on test Data = 0.246183


In [39]:
# The fitted pipeline keeps its stages in order: index 0 is the feature
# indexer and index 1 is the trained GBT regression model.
fitted_stages = model.stages
gbtModel = fitted_stages[1]
print(gbtModel)

GBTRegressionModel: uid=GBTRegressor_a8a946422ebd, numTrees=10, numFeatures=692


In [40]:
# Shut down the Spark session now that the analysis is finished. Cells above
# that use `spark` cannot be re-run afterwards without creating a new session.
spark.stop()