<a href="https://colab.research.google.com/github/stevejj4/Apache-Spark/blob/main/RegressionAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# install pyspark
!pip install pyspark
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [5]:
# Import the Spark libraries used in this notebook
import findspark # simplifies using Apache Spark from Python
findspark.init()
from pyspark.sql import SparkSession

# Import the Spark ML classes: feature assembly, the regression model, and its evaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


In [6]:
# Start a Spark session named "mpg", or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("mpg")
    .getOrCreate()
)

In [1]:
# downloding the data file
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0231EN-SkillsNetwork/datasets/mpg.csv


--2024-08-16 05:00:50--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0231EN-SkillsNetwork/datasets/mpg.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13891 (14K) [text/csv]
Saving to: ‘mpg.csv’


2024-08-16 05:00:50 (293 MB/s) - ‘mpg.csv’ saved [13891/13891]



In [7]:
# Read mpg.csv into a DataFrame: the first row supplies the column names
# and the column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("mpg.csv")
)

In [8]:
# Print the inferred schema (column names, types, nullability) of the loaded data
df.printSchema()

root
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Engine Disp: double (nullable = true)
 |-- Horsepower: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Accelerate: double (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Origin: string (nullable = true)



In [9]:
# Preview the first five rows of the raw data.
df.show(n=5)

+----+---------+-----------+----------+------+----------+----+--------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|
+----+---------+-----------+----------+------+----------+----+--------+
|15.0|        8|      390.0|       190|  3850|       8.5|  70|American|
|21.0|        6|      199.0|        90|  2648|      15.0|  70|American|
|18.0|        6|      199.0|        97|  2774|      15.5|  70|American|
|16.0|        8|      304.0|       150|  3433|      12.0|  70|American|
|14.0|        8|      455.0|       225|  3086|      10.0|  70|American|
+----+---------+-----------+----------+------+----------+----+--------+
only showing top 5 rows



In [10]:
# TODO: perform further exploratory data analysis here

In [14]:
# Build the feature vector: the six numeric predictor columns are packed into a
# single vector column called "features". MPG and Origin are not included.
assembler = VectorAssembler(
    inputCols=[
        "Cylinders",
        "Engine Disp",
        "Horsepower",
        "Weight",
        "Accelerate",
        "Year",
    ],
    outputCol="features",
)
df_transformed = assembler.transform(df)
df_transformed.show(n=5)

+----+---------+-----------+----------+------+----------+----+--------+--------------------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|            features|
+----+---------+-----------+----------+------+----------+----+--------+--------------------+
|15.0|        8|      390.0|       190|  3850|       8.5|  70|American|[8.0,390.0,190.0,...|
|21.0|        6|      199.0|        90|  2648|      15.0|  70|American|[6.0,199.0,90.0,2...|
|18.0|        6|      199.0|        97|  2774|      15.5|  70|American|[6.0,199.0,97.0,2...|
|16.0|        8|      304.0|       150|  3433|      12.0|  70|American|[8.0,304.0,150.0,...|
|14.0|        8|      455.0|       225|  3086|      10.0|  70|American|[8.0,455.0,225.0,...|
+----+---------+-----------+----------+------+----------+----+--------+--------------------+
only showing top 5 rows



In [16]:
# Show the assembled feature vectors next to the MPG column, without truncating
# the vector text (20 rows, which is show()'s default).
df_transformed.select("features", "MPG").show(n=20, truncate=False)

+----------------------------------+----+
|features                          |MPG |
+----------------------------------+----+
|[8.0,390.0,190.0,3850.0,8.5,70.0] |15.0|
|[6.0,199.0,90.0,2648.0,15.0,70.0] |21.0|
|[6.0,199.0,97.0,2774.0,15.5,70.0] |18.0|
|[8.0,304.0,150.0,3433.0,12.0,70.0]|16.0|
|[8.0,455.0,225.0,3086.0,10.0,70.0]|14.0|
|[8.0,350.0,165.0,3693.0,11.5,70.0]|15.0|
|[8.0,307.0,130.0,3504.0,12.0,70.0]|18.0|
|[8.0,454.0,220.0,4354.0,9.0,70.0] |14.0|
|[8.0,400.0,150.0,3761.0,9.5,70.0] |15.0|
|[8.0,307.0,200.0,4376.0,15.0,70.0]|10.0|
|[8.0,383.0,170.0,3563.0,10.0,70.0]|15.0|
|[8.0,318.0,210.0,4382.0,13.5,70.0]|11.0|
|[8.0,360.0,215.0,4615.0,14.0,70.0]|10.0|
|[8.0,429.0,198.0,4341.0,10.0,70.0]|15.0|
|[6.0,200.0,85.0,2587.0,16.0,70.0] |21.0|
|[8.0,302.0,140.0,3449.0,10.5,70.0]|17.0|
|[8.0,304.0,193.0,4732.0,18.5,70.0]|9.0 |
|[8.0,340.0,160.0,3609.0,8.0,70.0] |14.0|
|[6.0,198.0,95.0,2833.0,15.5,70.0] |22.0|
|[8.0,440.0,215.0,4312.0,8.5,70.0] |14.0|
+---------------------------------

In [17]:
# Randomly split the rows 70/30 into training and test sets, using a fixed seed.
train, test = df_transformed.randomSplit(weights=[0.7, 0.3], seed=42)


In [18]:
# Configure a linear regression of MPG on the "features" vector,
# then fit it on the training split.
lr = LinearRegression(labelCol="MPG", featuresCol="features")
model = lr.fit(train)

In [19]:
# Apply the fitted model to the test split; the result carries a
# "prediction" column alongside the original columns.
forcast = model.transform(test)
forcast.show(n=5)

+----+---------+-----------+----------+------+----------+----+--------+--------------------+------------------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|            features|        prediction|
+----+---------+-----------+----------+------+----------+----+--------+--------------------+------------------+
|10.0|        8|      360.0|       215|  4615|      14.0|  70|American|[8.0,360.0,215.0,...|  6.68334402404847|
|11.0|        8|      429.0|       208|  4633|      11.0|  72|American|[8.0,429.0,208.0,...| 8.344953219723397|
|12.0|        8|      350.0|       180|  4499|      12.5|  73|American|[8.0,350.0,180.0,...|10.043420590827129|
|12.0|        8|      383.0|       180|  4955|      11.5|  71|American|[8.0,383.0,180.0,...| 5.252194346982428|
|13.0|        8|      302.0|       129|  3169|      12.0|  75|American|[8.0,302.0,129.0,...|21.473697417345075|
+----+---------+-----------+----------+------+----------+----+--------+--------------------+------------

In [20]:
# Goodness of fit: compute R-squared of the predictions against the MPG label.
evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="MPG",
    predictionCol="prediction",
)
evaluator.evaluate(forcast)

0.8046190375720313

In [None]:
# TODO: also evaluate the model using RMSE (root mean squared error)
