In [1]:
import os

# 1. Install OpenJDK 21 (if not already done in a previous cell)
!apt-get update -qq
!apt-get install -qq openjdk-21-jdk-headless

# 2. Verify where it landed (if needed)
!ls /usr/lib/jvm | grep 21

# 3. Point to JDK 21
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# 4. Install PySpark via pip (make sure this happens AFTER setting JAVA_HOME)
!pip install pyspark --quiet

# 5. Import and start Spark
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
      .master("local[*]")
      .appName("PySpark-LinearRegression_Advertising")
      .getOrCreate()
)


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package openjdk-21-jre-headless:amd64.
(Reading database ... 126102 files and directories currently installed.)
Preparing to unpack .../openjdk-21-jre-headless_21.0.7+6~us1-0ubuntu1~22.04_amd64.deb ...
Unpacking openjdk-21-jre-headless:amd64 (21.0.7+6~us1-0ubuntu1~22.04) ...
Selecting previously unselected package openjdk-21-jdk-headless:amd64.
Preparing to unpack .../openjdk-21-jdk-headless_21.0.7+6~us1-0ubuntu1~22.04_amd64.deb ...
Unpacking openjdk-21-jdk-headless:amd64 (21.0.7+6~us1-0ubuntu1~22.04) ...
Setting up openjdk-21-jre-headless:amd64 (21.0.7+6~us1-0ubuntu1~22.04) ...
update-alternatives: using /usr/lib/jvm/java-21-openjdk-amd64/bin/java to provide /usr/bin/java (java) in auto mode
update-alternatives: using /usr/lib/jvm/java-21-openjdk-amd64/bin/jpackage to

In [2]:
# Mount Google Drive at /content/drive (force_remount=True requests a fresh mount).
# NOTE(review): the dataset cell below reads /content/Advertising.csv, which is
# outside this mount point — confirm whether the Drive mount is actually needed.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# 1. Load the data set

In [3]:
# Upload Advertising.csv to the Colab session storage (/content) before running
# this cell; the path below is NOT inside the mounted Google Drive.
# Spark's built-in CSV reader replaces the legacy 'com.databricks.spark.csv'
# format name; header/inferSchema are each passed exactly once.
df = spark.read.csv(
    "/content/Advertising.csv",
    header=True,        # first line holds the column names
    inferSchema=True,   # parse numeric columns as numbers instead of strings
)

In [4]:
# Preview the loaded rows (with no arguments, show() prints the top 20 rows).
df.show()

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
|  8.7| 48.9|     75.0|  7.2|
| 57.5| 32.8|     23.5| 11.8|
|120.2| 19.6|     11.6| 13.2|
|  8.6|  2.1|      1.0|  4.8|
|199.8|  2.6|     21.2| 10.6|
| 66.1|  5.8|     24.2|  8.6|
|214.7| 24.0|      4.0| 17.4|
| 23.8| 35.1|     65.9|  9.2|
| 97.5|  7.6|      7.2|  9.7|
|204.1| 32.9|     46.0| 19.0|
|195.4| 47.7|     52.9| 22.4|
| 67.8| 36.6|    114.0| 12.5|
|281.4| 39.6|     55.8| 24.4|
| 69.2| 20.5|     18.3| 11.3|
|147.3| 23.9|     19.1| 14.6|
+-----+-----+---------+-----+
only showing top 20 rows



In [5]:
# Print column names and inferred types to confirm every column was parsed as numeric.
df.printSchema()

root
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



# 2. Convert the data into features and a label

In [6]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
def transData(data, label_col=None, feature_cols=None):
    """Reshape a Spark DataFrame into the (features, label) layout used by pyspark.ml.

    Args:
        data: Spark DataFrame holding numeric feature columns and one label column.
        label_col: Name of the label column. Defaults to the last column of
            ``data``, which is what this function always assumed before.
        feature_cols: Names of the columns to pack into the feature vector.
            Defaults to every column except ``label_col``.

    Returns:
        A DataFrame with exactly two columns: ``features`` (an ML vector) and
        ``label`` (the label column's values, renamed).

    Raises:
        ValueError: If ``data`` has fewer than two columns, so no feature/label
            split is possible.
    """
    # Local imports keep this cell self-contained.
    from pyspark.ml.feature import VectorAssembler
    from pyspark.sql.functions import col

    columns = data.columns
    if len(columns) < 2:
        raise ValueError(
            "transData needs at least one feature column and one label column"
        )
    if label_col is None:
        label_col = columns[-1]
    if feature_cols is None:
        feature_cols = [name for name in columns if name != label_col]

    # VectorAssembler builds the vector inside the JVM, avoiding the round-trip
    # through a Python RDD lambda for every row.
    assembler = VectorAssembler(inputCols=list(feature_cols), outputCol="features")
    return assembler.transform(data).select("features", col(label_col).alias("label"))

# 3. Transform the dataset into a features/label DataFrame

In [7]:
# Build the (features, label) frame and preview its first six rows.
data = transData(df)
data.show(n=6)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
|  [8.7,48.9,75.0]|  7.2|
+-----------------+-----+
only showing top 6 rows



# 4. Index the features and split the data into training and test sets

In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
# Fit a VectorIndexer that writes its result to "indexedFeatures"; features
# with at most maxCategories distinct values are treated as categorical.
# NOTE(review): the indexer is fit on all of `data`, i.e. before the split, so
# it also sees the rows that are later held out for testing.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

# 60% training / 40% test; the fixed seed keeps the split repeatable.
trainingData, testData = data.randomSplit([0.6, 0.4], seed=218)

# 5. Fit linear regression model

In [9]:
# Fit an elastic-net linear regression (elasticNetParam=0.8 mixes L1 and L2).
# featuresCol must name the indexer's output column: with the default
# ("features") the model silently bypassed the first pipeline stage.
lr = LinearRegression(
    featuresCol="indexedFeatures",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
# Chain the feature indexer and the linear regression in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, lr])
# Train the model. This also runs the indexer.
model = pipeline.fit(trainingData)
# Stage 0 is the indexer, stage 1 is the fitted LinearRegressionModel.
lrmodel = model.stages[1]
# Only a cell's last expression is displayed, so print the parameters explicitly.
print("Coefficients:", lrmodel.coefficients)
print("Intercept:", lrmodel.intercept)
# Mean absolute error from the model's training summary
lrmodel.summary.meanAbsoluteError

1.3536052449934453

# 6. Make predictions

In [10]:
# Score the held-out rows with the fitted pipeline.
predictions = model.transform(testData)
# Preview a few predictions next to their true labels and input features.
preview_columns = ["prediction", "label", "features"]
predictions.select(*preview_columns).show(n=5)

+------------------+-----+----------------+
|        prediction|label|        features|
+------------------+-----+----------------+
| 6.098217842293607|  3.2|  [4.1,11.6,5.7]|
|   9.0453590533938|  5.3|  [5.4,29.9,9.4]|
| 8.742649922605892|  5.7|  [8.4,27.2,2.1]|
|4.7824788680237384|  4.8|   [8.6,2.1,1.0]|
|10.412867268349295|  7.3|[11.7,36.9,45.2]|
+------------------+-----+----------------+
only showing top 5 rows



# 7. Evaluation

In [11]:
# Compare the "prediction" column with the true "label" column and report the
# root mean squared error on the test-set predictions.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 1.72421


In [12]:
# Collect labels and predictions with a single Spark action. Two separate
# toPandas() calls run two independent jobs, and the metric computed from these
# frames needs row i of y_true to match row i of y_pred.
scored_pdf = predictions.select("label", "prediction").toPandas()
# Double brackets keep both results as one-column pandas DataFrames, as before.
y_true = scored_pdf[["label"]]
y_pred = scored_pdf[["prediction"]]

In [13]:
import sklearn.metrics
# Coefficient of determination (R^2) of the test-set predictions.
r2_score = sklearn.metrics.r2_score(y_true=y_true, y_pred=y_pred)
print(r2_score)

0.8989563084326142


# 8. Fit generalized linear regression model

In [14]:
# Import the GeneralizedLinearRegression class
from pyspark.ml.regression import GeneralizedLinearRegression
# Define the GLM: Gaussian family with identity link, i.e. a linear model.
# featuresCol names the indexer's output so the first pipeline stage is
# actually consumed (the default "features" column bypassed it).
glr = GeneralizedLinearRegression(
    featuresCol="indexedFeatures",
    family="gaussian",
    link="identity",
    maxIter=10,
    regParam=0.3,
)
# Create the pipeline and fit it on the training split.
# NOTE: this cell rebinds pipeline, model, predictions, evaluator and rmse, so
# re-running the earlier evaluation cells afterwards will score this GLM
# instead of the elastic-net model.
pipeline = Pipeline(stages=[featureIndexer, glr])
model = pipeline.fit(trainingData)
# Make predictions on the held-out rows
predictions = model.transform(testData)
# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)
# Evaluation: RMSE between predictions and true labels on the test data
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

+------------------+-----+----------------+
|        prediction|label|        features|
+------------------+-----+----------------+
| 5.818378480845925|  3.2|  [4.1,11.6,5.7]|
|  8.86945269880158|  5.3|  [5.4,29.9,9.4]|
| 8.517309580526351|  5.7|  [8.4,27.2,2.1]|
| 4.437492260704728|  4.8|   [8.6,2.1,1.0]|
|10.466526621329647|  7.3|[11.7,36.9,45.2]|
+------------------+-----+----------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 1.66821


In [15]:
# Shut down the SparkSession started in the first cell.
spark.stop()