In [8]:
# Installing Required Libraries
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://mirrors.viethosting.com/apache/spark/spark-2.4.6/spark-2.4.6-bin-hadoop2.7.tgz
!tar xf spark-2.4.6-bin-hadoop2.7.tgz
!pip install -q findspark

In [10]:
# Record where Java 8 and the unpacked Spark distribution live
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.6-bin-hadoop2.7",
})

In [11]:
# Run findspark first, then import pyspark and start (or reuse) a session
import findspark
findspark.init()
from pyspark.sql import SparkSession

# "local[*]" is the master URL handed to the session builder
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [12]:
# Colab only: mount Google Drive so the dataset stored there can be read
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [53]:
# Importing all the libraries

from pyspark.ml import feature
from pyspark.ml import regression

In [16]:
# Load the training CSV from the mounted drive (first row is the header,
# column types are inferred) and preview the first five records
sdf = spark.read.csv(
    path='/content/drive/My Drive/CrossValidation_And_HyperParam/train.csv',
    header=True,
    inferSchema=True,
)
sdf.show(n=5)

+-------------+----+-----------+--------+---+------+----------+-----+---------+-------+---+---------+--------+----+----+----+---------+-------+------------+----+-----------+
|battery_power|blue|clock_speed|dual_sim| fc|four_g|int_memory|m_dep|mobile_wt|n_cores| pc|px_height|px_width| ram|sc_h|sc_w|talk_time|three_g|touch_screen|wifi|price_range|
+-------------+----+-----------+--------+---+------+----------+-----+---------+-------+---+---------+--------+----+----+----+---------+-------+------------+----+-----------+
|          842|   0|        2.2|       0|  1|     0|         7|  0.6|      188|      2|  2|       20|     756|2549|   9|   7|       19|      0|           0|   1|          1|
|         1021|   1|        0.5|       1|  0|     1|        53|  0.7|      136|      3|  6|      905|    1988|2631|  17|   3|        7|      1|           1|   0|          2|
|          563|   1|        0.5|       1|  2|     1|        41|  0.9|      145|      5|  6|     1263|    1716|2603|  11|   2|     

In [17]:
# Print each column's name, type and nullability (types come from inferSchema=True at load time)
sdf.printSchema()

root
 |-- battery_power: integer (nullable = true)
 |-- blue: integer (nullable = true)
 |-- clock_speed: double (nullable = true)
 |-- dual_sim: integer (nullable = true)
 |-- fc: integer (nullable = true)
 |-- four_g: integer (nullable = true)
 |-- int_memory: integer (nullable = true)
 |-- m_dep: double (nullable = true)
 |-- mobile_wt: integer (nullable = true)
 |-- n_cores: integer (nullable = true)
 |-- pc: integer (nullable = true)
 |-- px_height: integer (nullable = true)
 |-- px_width: integer (nullable = true)
 |-- ram: integer (nullable = true)
 |-- sc_h: integer (nullable = true)
 |-- sc_w: integer (nullable = true)
 |-- talk_time: integer (nullable = true)
 |-- three_g: integer (nullable = true)
 |-- touch_screen: integer (nullable = true)
 |-- wifi: integer (nullable = true)
 |-- price_range: integer (nullable = true)



In [27]:
# Collect the independent variables; pyspark.ml expects them assembled into a single vector column.
# The label is excluded by name rather than by position, so a CSV whose columns are
# reordered (or extended) cannot leak 'price_range' into the feature vector.
feature_columns = [column for column in sdf.columns if column != 'price_range']
vector_assembler = feature.VectorAssembler(inputCols=feature_columns, outputCol='features')

In [30]:
# Apply the assembler to the raw frame; the result carries the assembled 'features' vector column
data_w_features = vector_assembler.transform(sdf)

In [43]:
# Keep only the assembled feature vector and the label, then preview five rows
sdf_final = data_w_features.select(['features', 'price_range'])
sdf_final.show(n=5)

+--------------------+-----------+
|            features|price_range|
+--------------------+-----------+
|[842.0,0.0,2.2,0....|          1|
|[1021.0,1.0,0.5,1...|          2|
|[563.0,1.0,0.5,1....|          2|
|[615.0,1.0,2.5,0....|          2|
|[1821.0,1.0,1.2,0...|          1|
+--------------------+-----------+
only showing top 5 rows



In [46]:
# Split the data into train (70%) and test (30%) sets.
# randomSplit samples randomly; a fixed seed keeps the split repeatable for the same
# input, so the summaries and metrics below do not change on every re-run.
train_sdf, test_sdf = sdf_final.randomSplit([0.7, 0.3], seed=42)

In [50]:
# Summary statistics (count, mean, stddev, min, max) for the training split
train_summary = train_sdf.describe()
train_summary.show()

+-------+-----------------+
|summary|      price_range|
+-------+-----------------+
|  count|             1395|
|   mean|1.488888888888889|
| stddev|1.128278032378756|
|    min|                0|
|    max|                3|
+-------+-----------------+



In [51]:
# Summary statistics (count, mean, stddev, min, max) for the test split
test_summary = test_sdf.describe()
test_summary.show()

+-------+------------------+
|summary|       price_range|
+-------+------------------+
|  count|               605|
|   mean|1.5256198347107437|
| stddev|1.0954850845478725|
|    min|                 0|
|    max|                 3|
+-------+------------------+



In [57]:
# Configure a linear regression estimator and fit it on the training dataframe
LinReg = regression.LinearRegression(
    labelCol="price_range",
    featuresCol="features",
)
model = LinReg.fit(dataset=train_sdf)

In [58]:
# Evaluate the fitted model on the test dataframe and preview its predictions
pred = model.evaluate(test_sdf)
# n=20 and truncate=True are show()'s defaults, written out explicitly
pred.predictions.show(n=20, truncate=True)

In [61]:
# Display the fitted coefficients: a vector with one weight per entry of the 'features' column
model.coefficients

DenseVector([0.0005, -0.0059, -0.0148, -0.0252, 0.0007, 0.0054, 0.0008, 0.0084, -0.001, 0.0046, -0.0012, 0.0003, 0.0003, 0.0009, -0.0007, 0.0013, 0.0013, 0.0048, -0.0269, -0.0075])

In [62]:
# Display the fitted model's intercept term
model.intercept

-1.5553972803756233

In [64]:
# Score the test-set predictions with Root Mean Square Error (RMSE), Mean Square Error (MSE),
# Mean Absolute Error (MAE) and R-Square
from pyspark.ml.evaluation import RegressionEvaluator

evaluation = RegressionEvaluator(labelCol="price_range", predictionCol="prediction")

# One evaluator is reused; the metric is overridden per call through the param map.
rmse, mse, mae, r2 = (
    evaluation.evaluate(pred.predictions, {evaluation.metricName: metric})
    for metric in ("rmse", "mse", "mae", "r2")
)

print("RMSE: %.3f" % rmse)
print("MSE: %.3f" % mse)
print("MAE: %.3f" % mae)
print("r2: %.3f" % r2)


RMSE: 0.321
MSE: 0.103
MAE: 0.265
r2: 0.914
