Linear Regression Project

In [1]:
# !pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=1ddd2d30c4b93553114491273fd59bd77b9a89226992ef5b5102a086436d1ff5
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
from pyspark.ml.feature import VarianceThresholdSelector
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [36]:
spark = SparkSession.builder.appName('lr_project').getOrCreate()

In [37]:
# reading csv file
data = spark.read.csv("/content/drive/MyDrive/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Linear_Regression/cruise_ship_info.csv",inferSchema=True,header=True)

In [38]:
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [39]:
data.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [40]:
for item in data.head():
    print(item)

Journey
Azamara
6
30.276999999999997
6.94
5.94
3.55
42.64
3.55


### Spark accepts data in the form of two columns - ("label", "features")

In [41]:
data.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

#### Investigating label column 'crew'

In [42]:
data.select('crew').distinct().show()

+-----+
| crew|
+-----+
| 6.96|
|  8.0|
| 6.17|
| 6.44|
|  7.0|
| 8.48|
| 11.6|
| 2.97|
|  6.6|
| 5.45|
| 5.88|
| 10.3|
| 9.99|
|  6.7|
| 7.65|
|10.29|
| 5.35|
| 6.71|
| 1.97|
| 9.87|
+-----+
only showing top 20 rows



In [43]:
data.select('crew').summary().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              158|
|   mean|7.794177215189873|
| stddev|3.503486564627034|
|    min|             0.59|
|    25%|             5.45|
|    50%|             8.08|
|    75%|             9.99|
|    max|             21.0|
+-------+-----------------+



### To transform **string** categorical features ("Ship_name","Cruise_line") into **numerical** information, I used the StringIndexer

In [44]:
categorial_cols = [x for x, y in data.dtypes if y == 'string']
print(categorial_cols)
index_output_cols = [x + "Index" for x in categorial_cols]
ohe_output_cols  = [x + "OHE" for x in categorial_cols ]

['Ship_name', 'Cruise_line']


In [45]:
indexer = StringIndexer(inputCols=categorial_cols, outputCols=index_output_cols)
indexed = indexer.fit(data).transform(data)
indexed.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------+----------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Ship_nameIndex|Cruise_lineIndex|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------+----------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|          64.0|            16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|          98.0|            16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|          27.0|             1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|          31.0|             1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0| 

In [46]:
encoder = OneHotEncoder(inputCols=index_output_cols ,outputCols=ohe_output_cols)
encoded = encoder.fit(indexed).transform(indexed)
encoded.show(truncate=False)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------+----------------+-----------------+---------------+
|Ship_name  |Cruise_line|Age|Tonnage           |passengers|length|cabins|passenger_density|crew|Ship_nameIndex|Cruise_lineIndex|Ship_nameOHE     |Cruise_lineOHE |
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------+----------------+-----------------+---------------+
|Journey    |Azamara    |6  |30.276999999999997|6.94      |5.94  |3.55  |42.64            |3.55|64.0          |16.0            |(137,[64],[1.0]) |(19,[16],[1.0])|
|Quest      |Azamara    |6  |30.276999999999997|6.94      |5.94  |3.55  |42.64            |3.55|98.0          |16.0            |(137,[98],[1.0]) |(19,[16],[1.0])|
|Celebration|Carnival   |26 |47.262            |14.86     |7.22  |7.43  |31.8             |6.7 |27.0          |1.0             |(137,[27],[1.0]) |(19,[1],[1.0]) |
|Conquest   |Carnival 

In [47]:
numeric_cols = [x for (x,y) in data.dtypes if (((y== 'int') or (y=='double')) & (x != 'crew'))]
vec_assembler_input = ohe_output_cols + numeric_cols
vec_assembler = VectorAssembler(inputCols = vec_assembler_input, outputCol = 'features')

In [48]:
output = vec_assembler.transform(encoded)

In [49]:
output.select("features").show(truncate=False)

+--------------------------------------------------------------------------------------------+
|features                                                                                    |
+--------------------------------------------------------------------------------------------+
|(162,[64,153,156,157,158,159,160,161],[1.0,1.0,6.0,30.276999999999997,6.94,5.94,3.55,42.64])|
|(162,[98,153,156,157,158,159,160,161],[1.0,1.0,6.0,30.276999999999997,6.94,5.94,3.55,42.64])|
|(162,[27,138,156,157,158,159,160,161],[1.0,1.0,26.0,47.262,14.86,7.22,7.43,31.8])           |
|(162,[31,138,156,157,158,159,160,161],[1.0,1.0,11.0,110.0,29.74,9.53,14.88,36.99])          |
|(162,[34,138,156,157,158,159,160,161],[1.0,1.0,17.0,101.353,26.42,8.92,13.21,38.36])        |
|(162,[37,138,156,157,158,159,160,161],[1.0,1.0,22.0,70.367,20.52,8.55,10.2,34.29])          |
|(162,[38,138,156,157,158,159,160,161],[1.0,1.0,15.0,70.367,20.52,8.55,10.2,34.29])          |
|(162,[47,138,156,157,158,159,160,161],[1.0,1.0,23

In [50]:
output.show(truncate=False)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------+----------------+-----------------+---------------+--------------------------------------------------------------------------------------------+
|Ship_name  |Cruise_line|Age|Tonnage           |passengers|length|cabins|passenger_density|crew|Ship_nameIndex|Cruise_lineIndex|Ship_nameOHE     |Cruise_lineOHE |features                                                                                    |
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------+----------------+-----------------+---------------+--------------------------------------------------------------------------------------------+
|Journey    |Azamara    |6  |30.276999999999997|6.94      |5.94  |3.55  |42.64            |3.55|64.0          |16.0            |(137,[64],[1.0]) |(19,[16],[1.0])|(162,[64,153,156,157,158,159,160,161],[1.0,1.0,6.0,30.276999999999997,

In [51]:
final_data = output.select("features",'crew')

In [52]:
final_data.count()

158

### Removing low-variance features from dataset

In [53]:
selector = VarianceThresholdSelector(varianceThreshold=0.5, outputCol="selectedFeatures")

result = selector.fit(final_data).transform(final_data)

print("Features selected by VarianceThresholdSelector:")
result.show(truncate=False)

Features selected by VarianceThresholdSelector:
+--------------------------------------------------------------------------------------------+----+----------------------------------------------------------------+
|features                                                                                    |crew|selectedFeatures                                                |
+--------------------------------------------------------------------------------------------+----+----------------------------------------------------------------+
|(162,[64,153,156,157,158,159,160,161],[1.0,1.0,6.0,30.276999999999997,6.94,5.94,3.55,42.64])|3.55|(6,[0,1,2,3,4,5],[6.0,30.276999999999997,6.94,5.94,3.55,42.64]) |
|(162,[98,153,156,157,158,159,160,161],[1.0,1.0,6.0,30.276999999999997,6.94,5.94,3.55,42.64])|3.55|(6,[0,1,2,3,4,5],[6.0,30.276999999999997,6.94,5.94,3.55,42.64]) |
|(162,[27,138,156,157,158,159,160,161],[1.0,1.0,26.0,47.262,14.86,7.22,7.43,31.8])           |6.7 |(6,[0,1,2,3,4,5],[26.0,47.26

In [54]:
final_data.show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|(162,[64,153,156,...|3.55|
|(162,[98,153,156,...|3.55|
|(162,[27,138,156,...| 6.7|
|(162,[31,138,156,...|19.1|
|(162,[34,138,156,...|10.0|
|(162,[37,138,156,...| 9.2|
|(162,[38,138,156,...| 9.2|
|(162,[47,138,156,...| 9.2|
|(162,[48,138,156,...| 9.2|
|(162,[5,138,156,1...|11.5|
|(162,[53,138,156,...|11.6|
|(162,[57,138,156,...| 6.6|
|(162,[58,138,156,...| 9.2|
|(162,[62,138,156,...| 9.2|
|(162,[1,138,156,1...| 9.3|
|(162,[66,138,156,...|11.6|
|(162,[78,138,156,...|10.3|
|(162,[89,138,156,...| 9.2|
|(162,[10,138,156,...| 9.3|
|(162,[107,138,156...| 9.2|
+--------------------+----+
only showing top 20 rows



In [55]:
result_final = result.select("selectedFeatures",'crew')

In [56]:
train_data,test_data = result_final.randomSplit([0.7,0.3])

In [57]:
train_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              107|
|   mean|8.032523364485991|
| stddev|3.504858109112978|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



In [58]:
# Creating a Linear Regression Model object
lr = LinearRegression(featuresCol = 'selectedFeatures', labelCol='crew', predictionCol='prediction')


In [59]:
# Fitting the model to the data and call this model lrModel
lrModel = lr.fit(train_data)

In [60]:
# Printing the coefficients and intercept for linear regression
print("Coefficients: {} Intercept: {}".format(lrModel.coefficients,lrModel.intercept))

Coefficients: [-0.020162477367347817,0.016521910847773016,-0.16742230323612406,0.32305991961231256,0.8315104574656951,-0.012482687800220339] Intercept: 0.5161434513231222


In [61]:
coefficients = lrModel.coefficients
feature_importance = sorted(list(zip(data.columns[:-1], map(abs, coefficients))), key=lambda x: x[1], reverse=True)

print("Feature Importance:")
for feature, importance in feature_importance:
    print("  {}: {:.3f}".format(feature, importance))

Feature Importance:
  passengers: 0.832
  Tonnage: 0.323
  Age: 0.167
  Ship_name: 0.020
  Cruise_line: 0.017
  length: 0.012


In [62]:
test_results = lrModel.evaluate(test_data)

In [63]:
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
| -1.3253604097824567|
|   0.415287945001495|
|  0.9577163983619581|
| -0.5682402436948717|
| 0.49900022912656716|
|  0.6376249721733025|
|  0.9931390423264919|
|  1.7098164673680838|
| 0.39058140255521145|
|-0.26531500958032694|
| -0.4841160073391393|
| -0.7860072822254089|
|   0.893987929395335|
| -1.1994456539827656|
| -0.5764123524600032|
| 0.20740017706154212|
| 0.17926339743915154|
|  0.7952828875274172|
|  0.7380030317232844|
| -1.1333980531903496|
+--------------------+
only showing top 20 rows



In [64]:
predictions = lrModel.transform(test_data)
predictions.show()

+--------------------+-----+------------------+
|    selectedFeatures| crew|        prediction|
+--------------------+-----+------------------+
|(6,[0,1,2,3,4,5],...|  8.0| 9.325360409782457|
|(6,[0,1,2,3,4,5],...|13.13|12.714712054998506|
|(6,[0,1,2,3,4,5],...| 11.0|10.042283601638042|
|(6,[0,1,2,3,4,5],...| 11.6|12.168240243694871|
|(6,[0,1,2,3,4,5],...| 10.0| 9.500999770873433|
|(6,[0,1,2,3,4,5],...| 10.3| 9.662375027826698|
|(6,[0,1,2,3,4,5],...|12.38|11.386860957673509|
|(6,[0,1,2,3,4,5],...| 4.47| 2.760183532631916|
|(6,[0,1,2,3,4,5],...| 6.36| 5.969418597444789|
|(6,[0,1,2,3,4,5],...|  9.0| 9.265315009580327|
|(6,[0,1,2,3,4,5],...|10.68|11.164116007339139|
|(6,[0,1,2,3,4,5],...|  8.0| 8.786007282225409|
|(6,[0,1,2,3,4,5],...| 12.0|11.106012070604665|
|(6,[0,1,2,3,4,5],...|11.85|13.049445653982765|
|(6,[0,1,2,3,4,5],...|  0.6|1.1764123524600032|
|(6,[0,1,2,3,4,5],...|  6.8| 6.592599822938458|
|(6,[0,1,2,3,4,5],...| 4.45| 4.270736602560849|
|(6,[0,1,2,3,4,5],...| 9.59| 8.794717112

In [65]:
r2 = RegressionEvaluator(predictionCol="prediction", labelCol="crew", metricName="r2").evaluate(predictions)

### Evaluation of model performance on test data

In [66]:
print("RMSE: {}".format(test_results.rootMeanSquaredError))
print("MSE: {}".format(test_results.meanSquaredError))
print("Coefficient of Determination (R2):", r2)

RMSE: 0.8362918060549207
MSE: 0.6993839848746012
Coefficient of Determination (R2): 0.9411550158448562



#### **RMSE** is the square root of Mean Squared Error (MSE) and measures the standard deviation of residuals.


---


#### **MSE** represents the average of the squared difference between the observed and predicted values in the data set - variance of the residuals.


---

#### **R-squared** is the proportion of the variance in the dependent variable which is **explained by the linear regression model**.

---