# Predicting Crew Members

Hyundai is currently building new ships for some customers and wants you to create a model and use it to predict how many crew members the ships will need.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('ProjLinReg')
    .getOrCreate()
)

In [3]:
# Load the cruise-ship data; the first row holds the column names and the
# column types are inferred from the values.
df = spark.read.csv(
    "cruise_ship_info.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Print the column names and the types inferred while reading the CSV.
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [5]:
# Summary statistics (count, mean, stddev, min, max) of the crew column.
(
    df.select('crew')
      .describe()
      .show()
)

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              158|
|   mean|7.794177215189873|
| stddev|3.503486564627034|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



Linear Regression Model Starts
We break the process into the following parts:
- load libraries
- Feature Engineering
- Feature vectorization
- Feature selection
- test, train
- model
- model train
- model evaluate

In [6]:
# load libraries
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder

In [7]:
# Feature Engineering
df.printSchema()
# Candidate features: Cruise_line, Age, Tonnage, passengers, length, cabins, passenger_density
# Cruise_line is a string and Age is an integer; the remaining features are doubles.
# crew is the column the model is trained to predict.

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [8]:
# Feature Engineering, contd.
# Number of ships per cruise line; the data set contains 20 distinct lines.
temp_groupedData = df.groupBy('Cruise_line')
data_cruise_line = df.select('Cruise_line').distinct()
temp_groupedData.agg(
    F.count('Cruise_line')
).show()

+-----------------+------------------+
|      Cruise_line|count(Cruise_line)|
+-----------------+------------------+
|            Costa|                11|
|              P&O|                 6|
|           Cunard|                 3|
|Regent_Seven_Seas|                 5|
|              MSC|                 8|
|         Carnival|                22|
|          Crystal|                 2|
|           Orient|                 1|
|         Princess|                17|
|        Silversea|                 4|
|         Seabourn|                 3|
| Holland_American|                14|
|         Windstar|                 3|
|           Disney|                 2|
|        Norwegian|                13|
|          Oceania|                 3|
|          Azamara|                 2|
|        Celebrity|                10|
|             Star|                 6|
|  Royal_Caribbean|                23|
+-----------------+------------------+



In [9]:
# Feature Engineering, contd.
# Map every Cruise_line string to a numeric label in Cruise_line_label.
indexer = StringIndexer(
    outputCol='Cruise_line_label',
    inputCol="Cruise_line",
)
output_1 = indexer.fit(df).transform(df)
output_1.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-----------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_label|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-----------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|             16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|             16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|              1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|              1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|              1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|       

In [10]:
# List the columns after indexing; Cruise_line_label is the newly added one.
output_1.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Cruise_line_label']

In [11]:
# One-hot encode the indexed cruise line into Cruise_line_enc. This encoded
# column is part of the feature vector that the model is trained on.
encoder = OneHotEncoder(inputCol='Cruise_line_label', outputCol='Cruise_line_enc')
if hasattr(encoder, 'fit'):
    # Spark 3.x: OneHotEncoder is an Estimator and has to be fitted before it
    # can transform data.
    output_2 = encoder.fit(output_1).transform(output_1)
else:
    # Spark 2.x: OneHotEncoder is a plain Transformer.
    output_2 = encoder.transform(output_1)

In [12]:
output_2.select('Cruise_line_enc').head()[0]

SparseVector(19, {16: 1.0})

In [13]:
# Combine the numeric columns and the one-hot encoded cruise line into a
# single 'features' vector column.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Age',
        'Tonnage',
        'passengers',
        'length',
        'cabins',
        'passenger_density',
        'Cruise_line_enc',
    ],
)
final_output_2 = assembler.transform(output_2)

In [14]:
# Feature vectors - 1
# Same assembly, but with the plain numeric cruise-line label instead of the
# one-hot vector. Note that this rebinds the name `assembler`.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Age',
        'Tonnage',
        'passengers',
        'length',
        'cabins',
        'passenger_density',
        'Cruise_line_label',
    ],
)
final_output = assembler.transform(output_1)

In [15]:
# Show the first row at each stage, for both encodings of Cruise_line.
print("Cataegorical encoding")
print(
    output_1.select(
        'Age', 'Tonnage', 'passengers', 'length', 'cabins',
        'passenger_density', 'Cruise_line_label',
    ).head()
)
print("\nOne hot encoding")
print(
    output_2.select(
        'Age', 'Tonnage', 'passengers', 'length', 'cabins',
        'passenger_density', 'Cruise_line_enc',
    ).head()
)
print("\nAssembler Cataegorical")
print(
    final_output.select('features').head()
)
print("\nAssembler one hot encoding")
print(
    final_output_2.select('features').head()
)

Cataegorical encoding
Row(Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, Cruise_line_label=16.0)

One hot encoding
Row(Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, Cruise_line_enc=SparseVector(19, {16: 1.0}))

Assembler Cataegorical
Row(features=DenseVector([6.0, 30.277, 6.94, 5.94, 3.55, 42.64, 16.0]))

Assembler one hot encoding
Row(features=SparseVector(25, {0: 6.0, 1: 30.277, 2: 6.94, 3: 5.94, 4: 3.55, 5: 42.64, 22: 1.0}))


In [16]:
# Feature vectors - 2
# First assembled feature vector of the one-hot variant.
final_output_2.select('features').head()

Row(features=SparseVector(25, {0: 6.0, 1: 30.277, 2: 6.94, 3: 5.94, 4: 3.55, 5: 42.64, 22: 1.0}))

In [17]:
# Feature vectors - 3
# Keep only the one-hot based feature vector and the crew label.
final_data = final_output_2.select(
    "features",
    "crew",
)

In [18]:
# test, train
# 70/30 split. The seed is fixed so that re-running the notebook on the same
# input yields the same split instead of a different random one each time.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [19]:
# Row counts of the training and test splits.
train.count(), test.count()

(115, 43)

In [20]:
# model
# Linear regression that predicts 'crew' from the assembled 'features' column.
lr = LinearRegression(
    labelCol='crew',
    featuresCol='features',
)

In [21]:
# model train
# Fit on the training split and report the learned parameters.
lrModel = lr.fit(train)
print(
    "Coefficients: {} Intercept: {}".format(
        lrModel.coefficients,
        lrModel.intercept,
    )
)

Coefficients: [0.00484507852689659,0.00849961582279936,-0.0660214950709557,0.4809660965262299,0.6661892456699454,0.01345859263531389,-0.7182246727047528,0.5810496962743971,0.3680790777303016,-0.08180482979793215,0.7264037465386671,0.03454582271057268,0.878381371433544,0.27505459133018784,1.2938317039677167,0.38791227717352816,0.7745872322999914,0.15101569399799328,0.7365190870516255,0.4258041432886752,-0.2199262812807872,0.5690081890579585,0.27192526143290474,1.063726001511942,0.34650469291503594] Intercept: -2.3459342505326437


In [22]:
# model evaluate
# Score the fitted model on the held-out test split.
test_results = lrModel.evaluate(dataset=test)

In [23]:
# Error metrics of the model on the test split.
print(
    "RMSE: {}".format(test_results.rootMeanSquaredError)
)
print(
    "MSE: {}".format(test_results.meanSquaredError)
)
print(
    "R2: {}".format(test_results.r2)
)

RMSE: 1.4195855620976592
MSE: 2.015223168116127
R2: 0.8831875279943283


Looking at the Data

In [24]:
# Correlation between the crew and cabins columns over the full data set.
df.agg(
    F.corr('crew', 'cabins')
).show()

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+

