In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook runs on machines other than the author's; fall back to the
# original local install path when the variable is unset or empty.
findspark.init(os.environ.get("SPARK_HOME") or "/home/rajdeep/spark-3.5.0-bin-hadoop3")

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import corr

In [4]:
# Start (or reuse) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName("cruise")
    .getOrCreate()
)

23/11/21 15:48:12 WARN Utils: Your hostname, DESKTOP-CSFBOLK resolves to a loopback address: 127.0.1.1; using 172.19.12.103 instead (on interface eth0)
23/11/21 15:48:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/21 15:48:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Load the cruise-ship CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    "data/cruise_ship_info.csv",
    header=True,
    inferSchema=True,
)

                                                                                

In [6]:
# Configure the StringIndexer that maps the Cruise_line strings to a numeric
# Cruise_line_indexed column.
indexer = StringIndexer(
    inputCol="Cruise_line",
    outputCol="Cruise_line_indexed",
)

In [7]:
# Fit the indexer and append the numeric cruise-line column to df.
# Because df is overwritten here, a second run of this cell would hand the
# indexer a frame that already contains its output column, which Spark ML
# rejects. Dropping that column first is a no-op on the first run and keeps
# the cell safely re-runnable.
indexer_input_df = df.drop(indexer.getOutputCol())
indexer_model = indexer.fit(indexer_input_df)
df = indexer_model.transform(indexer_input_df)

                                                                                

In [8]:
# Preview the first 20 rows (long cell values truncated) to check the new indexed column.
df.show(n=20, truncate=True)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_indexed|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|                1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|                1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|                1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.5

In [9]:
# Configure the VectorAssembler that packs the predictor columns into a single
# "features" vector column.
feature_columns = [
    "Age",
    "Tonnage",
    "passengers",
    "length",
    "cabins",
    "passenger_density",
    "Cruise_line_indexed",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [10]:
# Append the assembled feature-vector column to df.
# df is overwritten here, so on a re-run it would already contain the
# assembler's output column and the transform would be rejected. Dropping it
# first is a no-op on the first run and makes the cell idempotent.
df = assembler.transform(df.drop(assembler.getOutputCol()))

In [11]:
# Split the data 70/30 into train and test sets.
# A fixed seed makes the split, and therefore every metric reported below,
# repeatable across runs instead of changing on each execution.
SPLIT_SEED = 42
train_df, test_df = df.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [12]:
# Configure the linear regression estimator: predict "crew" from the
# assembled "features" vector.
lr = LinearRegression(
    featuresCol="features",
    labelCol="crew",
)

In [13]:
# Fit the linear regression on the training split.
lr_model = lr.fit(dataset=train_df)

23/11/21 15:48:28 WARN Instrumentation: [acfd7912] regParam is zero, which might cause numerical instability and overfitting.
23/11/21 15:48:29 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/11/21 15:48:29 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [14]:
# Score the fitted model against the held-out test split.
test_df_evaluation = lr_model.evaluate(dataset=test_df)

In [15]:
# R² of the fitted linear regression model, measured on the test split
test_df_evaluation.r2

0.9339140276045034

In [16]:
# Root-mean-squared error of the fitted linear regression model on the test split
test_df_evaluation.rootMeanSquaredError

0.8447501462733567

In [17]:
# Keep only the feature vectors from the test split, leaving the label out.
prediction_input_cols = ["features"]
unlabelled_df = test_df.select(*prediction_input_cols)

In [18]:
# Run the fitted model over the feature-only frame to produce its predictions.
unlabelled_df = lr_model.transform(dataset=unlabelled_df)

In [19]:
# Preview the first 20 feature vectors with their predicted values.
unlabelled_df.show(n=20, truncate=True)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[13.0,85.619,21.1...| 9.625230330688474|
|[13.0,76.0,18.74,...| 8.669005389476522|
|[11.0,90.09,25.01...| 8.938538854192563|
|[18.0,70.60600000...| 7.904913542981781|
|[19.0,16.8,2.96,5...|2.2044856380660987|
|[17.0,101.353,26....|11.026442896496622|
|[21.0,50.76,17.48...| 7.254315271407881|
|[22.0,70.367,20.5...| 8.669684804369586|
|[6.0,113.0,37.82,...|11.664243331881217|
|[13.0,138.0,31.14...|13.412728346575019|
|[5.0,133.5,39.59,...|13.079705479342044|
|[23.0,70.367,20.5...| 8.681667502147828|
|[19.0,70.367,20.5...|  8.66501421962333|
|[21.0,19.093,8.0,...|  3.51521070178524|
|[12.0,108.865,27....|11.038568580715275|
|[17.0,74.137,19.5...| 8.701772878115438|
|[5.0,160.0,36.34,...|15.518199687398504|
|[12.0,91.0,20.32,...|  9.24424435277722|
|[15.0,30.27699999...|3.7744357289072874|
|[10.0,91.62700000...| 9.338264529592433|
+--------------------+------------

In [20]:
# Report the fitted model parameters: one coefficient per assembled feature,
# followed by the intercept.
summary_template = "Coefficients: {} Intercept: {}"
print(summary_template.format(lr_model.coefficients, lr_model.intercept))

Coefficients: [0.001556861582085266,0.019912791636392996,-0.15324164960003228,0.3287732411449961,0.8365692755028802,0.0029313888317101023,0.026534908869388918] Intercept: -1.0923209635495144


In [21]:
# Print the test-split error metrics, one per line.
for label_template, metric_value in (
    ("RMSE: {}", test_df_evaluation.rootMeanSquaredError),
    ("MSE: {}", test_df_evaluation.meanSquaredError),
    ("R2: {}", test_df_evaluation.r2),
):
    print(label_template.format(metric_value))

RMSE: 0.8447501462733567
MSE: 0.7136028096288575
R2: 0.9339140276045034


In [22]:
# Correlation between crew size and passenger count over the full dataset.
crew_passengers_corr = corr("crew", "passengers")
df.select(crew_passengers_corr).show()

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [23]:
# Correlation between crew size and cabin count over the full dataset.
crew_cabins_corr = corr("crew", "cabins")
df.select(crew_cabins_corr).show()

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+

