Hyundai wants to predict the crew numbers will be needed in the future cruise ships they want to produce with certain specs. In this case, I use Linear Regression on the past data of different ships specs and its crew numbers.

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('ship').getOrCreate()

In [3]:
data = spark.read.csv('cruise_ship_info.csv', header=True, inferSchema='True')

In [4]:
for row in data.head(3):
    print('\n')
    print(row)



Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)


Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)


Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7)


In [5]:
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



Description: Measurements of ship size, capacity, crew, and age for 158 cruise
ships.


Variables/Columns
Ship Name     1-20
Cruise Line   21-40
Age (as of 2013)   46-48
Tonnage (1000s of tons)   50-56
passengers (100s)   58-64
Length (100s of feet)  66-72
Cabins  (100s)   74-80
Passenger Density   82-88
Crew  (100s)   90-96

In [10]:
data.head()

Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)

In [7]:
for item in data.head():
    print (item)
    

Journey
Azamara
6
30.276999999999997
6.94
5.94
3.55
42.64
3.55


In [9]:
data.describe().show()

+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|summary|Ship_name|Cruise_line|               Age|           Tonnage|       passengers|           length|            cabins|passenger_density|             crew|
+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|  count|      158|        158|               158|               158|              158|              158|               158|              158|              158|
|   mean| Infinity|       null|15.689873417721518| 71.28467088607599|18.45740506329114|8.130632911392404| 8.830000000000005|39.90094936708861|7.794177215189873|
| stddev|      NaN|       null| 7.615691058751413|37.229540025907866|9.677094775143416|1.793473548054825|4.4714172221480615| 8.63921711391542|3.503486564627034|
|    min|Adventure|    Azamara|   

In [11]:
from pyspark.sql.functions import max,min
from pyspark.sql.functions import format_number

In [12]:
data.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [13]:
data_quantitative= data.select(
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'crew')

In [14]:
data_quantitative.describe().show()

+-------+------------------+-----------------+-----------------+------------------+-----------------+
|summary|           Tonnage|       passengers|           length|            cabins|             crew|
+-------+------------------+-----------------+-----------------+------------------+-----------------+
|  count|               158|              158|              158|               158|              158|
|   mean| 71.28467088607599|18.45740506329114|8.130632911392404| 8.830000000000005|7.794177215189873|
| stddev|37.229540025907866|9.677094775143416|1.793473548054825|4.4714172221480615|3.503486564627034|
|    min|             2.329|             0.66|             2.79|              0.33|             0.59|
|    max|             220.0|             54.0|            11.82|              27.0|             21.0|
+-------+------------------+-----------------+-----------------+------------------+-----------------+



In [15]:
data.filter(data['Tonnage']<50).count()

44

In [16]:
data.filter((data['Tonnage']>=50 )& (data['Tonnage']<=100)).count()

82

In [17]:
data.filter(data['Tonnage']>100).count()

32

In [56]:
data.filter(data['cabins']<=5.00).count()

33

In [60]:
data.filter((data['cabins']>5.00 ) & (data['cabins']<=10.00)).count()

63

In [58]:
data.filter(data['cabins']>10.00).count()

62

from pyspark.sql.functions import stddev #unresolved
cl_sd=data.groupBy("Cruise_line").agg(stddev('Tonnage').alias('Tonnage'))
cl_sd2=cl_sd.select('Cruise_line', format_number('Tonnage',2).alias('Tonnage'))
cl_mean.join(cl_sd2, Cruise_line=Cruise_line)

In [19]:
data.na.drop().count()

158

In [20]:
datac=data.na.drop()

In [30]:
datac.dropDuplicates().count()

158

In [21]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

In [None]:
# Attention! outputCol should be exactly 'features' with small f. Otherwise the fit  method won't work

In [22]:
assembler = VectorAssembler(inputCols=[ 
                                        'Tonnage',
                                         'passengers',
                                         'length',
                                         'cabins',
                                         'passenger_density',
                                         'crew'],
                            outputCol='features') #small f

In [23]:
output=assembler.transform(datac)

In [24]:
output.select('features').show(5)

+--------------------+
|            features|
+--------------------+
|[30.2769999999999...|
|[30.2769999999999...|
|[47.262,14.86,7.2...|
|[110.0,29.74,9.53...|
|[101.353,26.42,8....|
+--------------------+
only showing top 5 rows



In [25]:
prepared_data=output.select("features",'crew')

In [26]:
prepared_data.show(5)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[30.2769999999999...|3.55|
|[30.2769999999999...|3.55|
|[47.262,14.86,7.2...| 6.7|
|[110.0,29.74,9.53...|19.1|
|[101.353,26.42,8....|10.0|
+--------------------+----+
only showing top 5 rows



In [27]:
trainD,testD=prepared_data.randomSplit([0.7,0.3])

In [28]:
trainD.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              107|
|   mean|7.987943925233647|
| stddev|3.307928080461424|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



In [29]:
testD.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                51|
|   mean|  7.38764705882353|
| stddev|3.8852143252259816|
|    min|              0.59|
|    max|              19.1|
+-------+------------------+



In [30]:
from pyspark.ml.regression import LinearRegression

In [31]:
# Create a Linear Regression Model object
lr = LinearRegression(labelCol='crew')

In [32]:
# Fit the model to the train data and call this model lrModel
lrModel = lr.fit(trainD)

In [34]:
# Print the coefficients and intercept for linear regression
print("Coefficients:{}", "Intercept:{}".format(lrModel.coefficients, lrModel.intercept))

Coefficients:{} Intercept:[1.232535941970415e-15,-4.891574761936937e-16,-2.769408917566102e-14,-1.9762901078887148e-14,-2.389899693503025e-15,1.0000000000000262]


In [35]:
test_result=lrModel.evaluate(testD)

In [39]:
test_result.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|-3.01980662698042...|
|-3.15303338993544...|
|-1.17683640610266...|
|-5.35127497869325...|
|2.065014825802791...|
|1.687538997430238...|
|3.819167204710538...|
|3.774758283725532...|
|-1.82076576038525...|
|-1.68753899743023...|
|-9.76996261670137...|
|-1.02140518265514...|
|-6.21724893790087...|
|6.217248937900877...|
|-6.75015598972095...|
|-2.75335310107038...|
|-1.42108547152020...|
|3.552713678800501...|
|-7.99360577730112...|
|-1.77635683940025...|
+--------------------+
only showing top 20 rows



In [40]:
unlabeled_data=testD.select('features')

In [41]:
predictions=lrModel.transform(unlabeled_data)

In [42]:
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[3.341,0.66,2.79,...|0.5900000000000302|
|[5.35,1.67,4.4,0....|0.8800000000000315|
|[10.0,2.08,4.4,1....|1.6000000000000119|
|[12.5,3.94,4.36,0...|1.4600000000000535|
|[16.8,2.96,5.14,1...|1.9699999999999793|
|[16.8,2.96,5.14,1...| 2.099999999999983|
|[25.0,3.82,5.97,1...| 2.949999999999962|
|[25.0,3.88,5.97,1...|2.8699999999999624|
|[25.0,7.76,6.22,3...|3.8500000000000183|
|[30.2769999999999...| 4.000000000000017|
|[30.2769999999999...|3.7300000000000098|
|[30.2769999999999...|  3.73000000000001|
|[30.2769999999999...| 3.550000000000006|
|[38.0,7.49,6.74,3...| 4.599999999999993|
|[40.0530000000000...|7.5000000000000675|
|[42.0,14.8,7.13,7...| 6.800000000000027|
|[42.0,15.04,7.08,...| 6.300000000000014|
|[45.0,11.78,7.54,...| 5.199999999999997|
|[46.0,7.0,6.7,1.8...| 4.470000000000008|
|[46.052,14.52,7.2...| 6.600000000000017|
+--------------------+------------

In [44]:
print("RMSE: {}".format(test_result.rootMeanSquaredError))
print("MSE: {}".format(test_result.meanSquaredError))

RMSE: 3.8980613315179705e-14
MSE: 1.519488214427565e-27
