In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 44 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 57.0 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=de267821169bbd39384324c1d3eb24292272e2e617a3ea7fdf6e801cab3625f8
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [2]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session: "local" master, app name "Colab",
# Spark UI port set to 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

In [3]:
# Keep a handle on the SparkContext that belongs to the session created above.
sc = spark.sparkContext

In [4]:
from pyspark.ml.regression import LinearRegression
# Read the LIBSVM-formatted sample file into a DataFrame.
training = (
    spark.read
    .format("libsvm")
    .load("sample_linear_regression_data.txt")
)

In [5]:
# Display up to 50 rows, without truncating the column contents.
training.show(n=50, truncate=False)

+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label              |features                                                                                                                                                                                                                                |
+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|-9.490009878824548 |(10,[0,1,2,3,4,5,6,7,8,9],[0.4551273600657362,0.36644694351969087,-0.38256108933468047,-0.4458430198517267,0.33109790358914726,0.8067445293443565,-0.2624341731773887,-0.44850386111659524,-0.07269284838169332,0.5658

In [6]:
# Configure the estimator (iteration cap plus regularisation settings),
# then fit it on the training DataFrame.
lr = LinearRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lrModel = lr.fit(training)

In [8]:
# Render the fitted parameters as text first, then print them.
coefficients_text = str(lrModel.coefficients)
intercept_text = str(lrModel.intercept)
print("coefficients: %s" % coefficients_text)
print("Intercept: %s" % intercept_text)

coefficients: [0.0,0.3229251667740594,-0.3438548034562219,1.915601702345841,0.05288058680386255,0.765962720459771,0.0,-0.15105392669186676,-0.21587930360904645,0.2202536918881343]
Intercept: 0.15989368442397356


In [10]:
# Report the training summary of the fitted model: iteration count,
# objective history, per-row residuals, and goodness-of-fit metrics.
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
# Label corrected from "RRMSE": the value printed is rootMeanSquaredError (RMSE).
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)


numIterations: 6
objectiveHistory: [0.49999999999999994, 0.4967620357443381, 0.49363616643404634, 0.4936351537897608, 0.4936351214177871, 0.49363512062528014, 0.4936351206216114]
+--------------------+
|           residuals|
+--------------------+
|  -9.889232683103197|
|  0.5533794340053553|
|  -5.204019455758822|
| -20.566686715507508|
|    -9.4497405180564|
|  -6.909112502719487|
|  -10.00431602969873|
|  2.0623978070504845|
|  3.1117508432954772|
|  -15.89360822941938|
|  -5.036284254673026|
|  6.4832158769943335|
|  12.429497299109002|
|  -20.32003219007654|
|    -2.0049838218725|
| -17.867901734183793|
|   7.646455887420495|
| -2.2653482182417406|
|-0.10308920436195645|
|  -1.380034070385301|
+--------------------+
only showing top 20 rows

RRMSE: 10.189077
r2: 0.022861


In [15]:
from pyspark.ml.linalg import Vectors
# Two rows of (label, weight, features): one dense feature vector and one
# sparse vector of size 1 with no stored entries.
df = spark.createDataFrame(
    data=[
        (1.0, 2.0, Vectors.dense(1.0)),
        (0.0, 2.0, Vectors.sparse(1, [], [])),
    ],
    schema=["label", "weight", "features"],
)

In [16]:
# Print the column names and types of df.
df.printSchema()

root
 |-- label: double (nullable = true)
 |-- weight: double (nullable = true)
 |-- features: vector (nullable = true)



In [17]:
# Display the rows of df.
df.show()

+-----+------+---------+
|label|weight| features|
+-----+------+---------+
|  1.0|   2.0|    [1.0]|
|  0.0|   2.0|(1,[],[])|
+-----+------+---------+



In [18]:
# New estimator that reads per-row weights from the "weight" column and uses
# the "normal" solver; the iteration cap is set afterwards through the setter,
# whose return value is what the cell displays.
lr = LinearRegression(
    regParam=0.1,
    solver="normal",
    weightCol="weight",
)
lr.setMaxIter(5)

LinearRegression_7b48e0ff5331

In [19]:
# Read back the iteration cap set with setMaxIter.
lr.getMaxIter()

5

In [20]:
# Read back the regParam passed to the constructor.
lr.getRegParam()

0.1

In [21]:
# Fit the weighted estimator on the two-row DataFrame.
model = lr.fit(df)

In [22]:
# Read inputs from "features" and write predictions to "newPrediction".
# The setters return the model, so the chained call still displays it.
model.setFeaturesCol("features").setPredictionCol("newPrediction")

LinearRegressionModel: uid=LinearRegression_7b48e0ff5331, numFeatures=1

In [23]:
# Single-row test frame holding one dense feature vector.
test0 = spark.createDataFrame(
    data=[(Vectors.dense(-1.0),)],
    schema=["features"],
)

In [24]:
# Display the single test row.
test0.show()

+--------+
|features|
+--------+
|  [-1.0]|
+--------+



In [25]:
# Intercept of the fitted model.
model.intercept

0.08333333333333345

In [26]:
# First coefficient of the fitted model (the model reports numFeatures=1).
model.coefficients[0]

0.8333333333333333

In [27]:
# Predict directly on the feature vector taken from the first row of test0.
model.predict(test0.head()["features"])

-0.7499999999999998

In [30]:
# Single-row test frame holding one sparse feature vector (index 0 -> 1.0).
test1 = spark.createDataFrame(
    data=[(Vectors.sparse(1, [0], [1.0]),)],
    schema=["features"],
)

In [31]:
# Score test1 and read the prediction column from the first result row.
model.transform(test1).head()["newPrediction"]

0.9166666666666667