In [34]:
! pip install pyspark



In [35]:
## IMPORT NECESSARY LIBRARIES
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

In [36]:
## CREATING A SPARK SESSION
# Start a Spark session
spark = SparkSession.builder.appName("LinearRegressionProject").getOrCreate()

In [37]:
## LOADING THE SALARY DATASET
data = spark.read.csv("Salary_Data.csv", header=True, inferSchema=True)



## DISPLAYING THE 1ST FEW ROWS OF THE DATAFRAME
data.show(5)

+---------------+-------+
|YearsExperience| Salary|
+---------------+-------+
|            1.1|39343.0|
|            1.3|46205.0|
|            1.5|37731.0|
|            2.0|43525.0|
|            2.2|39891.0|
+---------------+-------+
only showing top 5 rows



In [38]:
## DATA PREPROCESSING TO ASSEMBLE THE YEARS EXPERIENCE COLUMN INTO A FEATURE COLUMN FOR LINEAR REGRESSION
assembler = VectorAssembler(inputCols=["YearsExperience"], outputCol="feature")
assembled_data = assembler.transform(data)


In [39]:
## SHOW THE TRANSFORMED DATA WITH THE 'feature' COLUMN
assembled_data.select("YearsExperience", "Salary", "feature").show()

+---------------+-------+-------+
|YearsExperience| Salary|feature|
+---------------+-------+-------+
|            1.1|39343.0|  [1.1]|
|            1.3|46205.0|  [1.3]|
|            1.5|37731.0|  [1.5]|
|            2.0|43525.0|  [2.0]|
|            2.2|39891.0|  [2.2]|
|            2.9|56642.0|  [2.9]|
|            3.0|60150.0|  [3.0]|
|            3.2|54445.0|  [3.2]|
|            3.2|64445.0|  [3.2]|
|            3.7|57189.0|  [3.7]|
|            3.9|63218.0|  [3.9]|
|            4.0|55794.0|  [4.0]|
|            4.0|56957.0|  [4.0]|
|            4.1|57081.0|  [4.1]|
|            4.5|61111.0|  [4.5]|
|            4.9|67938.0|  [4.9]|
|            5.1|66029.0|  [5.1]|
|            5.3|83088.0|  [5.3]|
|            5.9|81363.0|  [5.9]|
|            6.0|93940.0|  [6.0]|
+---------------+-------+-------+
only showing top 20 rows



In [40]:
## SPLITTING THE DATA INTO TRAINING AND TESTING SET
train_data, test_data = assembled_data.randomSplit([0.8, 0.2], seed=1234)


In [41]:
## TRAINING THE LINEAR REGRESSION MODEL
lr = LinearRegression(featuresCol="feature", labelCol="Salary")

lr_model = lr.fit(train_data)

print("Coefficient is:", lr_model.coefficients)
print("Bias term is:", lr_model.intercept)

Coefficient is: [9645.93926054687]
Bias term is: 24815.60012255437


In [42]:
## MAKING PREDICTIONS ON THE TEST DATA AND DISPLAYING THEM
lr_predictions = lr_model.transform(test_data)

lr_predictions.select("YearsExperience", "Salary", "prediction").show()

+---------------+--------+------------------+
|YearsExperience|  Salary|        prediction|
+---------------+--------+------------------+
|            1.3| 46205.0|37355.321161265296|
|            2.2| 39891.0| 46036.66649575748|
|            7.1| 98273.0| 93301.76887243714|
|            8.7|109431.0|108735.27168931213|
|            9.0|105582.0| 111629.0534674762|
|            9.5|116969.0|116452.02309774962|
|            9.6|112635.0|117416.61702380431|
+---------------+--------+------------------+



In [43]:
## MODEL EVALUATION
evaluator = RegressionEvaluator(labelCol="Salary", predictionCol="prediction", metricName="rmse")

rmse = evaluator.evaluate(lr_predictions)
print("Root Mean Squared Error (RMSE) on test data:", rmse)

Root Mean Squared Error (RMSE) on test data: 5358.320514463983


In [44]:
## MODEL ADEQUACY USING R_SQUARE SCORE AND ADJUSTED R_SQUARED
evaluator.setMetricName("r2")
r2 = evaluator.evaluate(lr_predictions)
print("R-squared on test data:", r2)


n = assembled_data.count()
p = len(lr_model.coefficients)


adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print("Adjusted R-squared:", adj_r2)


R-squared on test data: 0.968385475310239
Adjusted R-squared: 0.9672563851427476
