# Predict

Hyundai is currently building new ships for some customers and wants you to create a model and use it to predict how many crew members the ships will need.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('ProjLinReg')
    .getOrCreate()
)

In [3]:
# Load the cruise-ship CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    "cruise_ship_info.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Show the inferred column names and types of the loaded data.
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [5]:
# Summary statistics (count, mean, stddev, min, max) of the target column.
crew_summary = df.select('crew').describe()
crew_summary.show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              158|
|   mean|7.794177215189873|
| stddev|3.503486564627034|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



Linear Regression Model Starts
We break the process into the following parts:
- load libraries
- Feature Engineering
- Feature vectorization
- Feature selection
- test, train
- model
- model train
- model evaluate

In [6]:
# load libraries
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder

In [7]:
# Feature Engineering
# Re-print the schema to pick the model inputs; crew is the prediction target.
df.printSchema()
# Candidate features: Cruise_line, Age, Tonnage, passengers, length, cabins, passenger_density
# Cruise_line is a string and Age is an integer; the remaining features are doubles.
# Ship_name is left out of the feature list.

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [8]:
# Feature Engineering, contd.
# Cruise_line is the only categorical feature. Collect its distinct values
# (20 of them) and show how many ships belong to each line.
data_cruise_line = df.select('Cruise_line').distinct()
temp_groupedData = df.groupBy('Cruise_line')
ships_per_line = temp_groupedData.agg(F.count('Cruise_line'))
ships_per_line.show()

+-----------------+------------------+
|      Cruise_line|count(Cruise_line)|
+-----------------+------------------+
|            Costa|                11|
|              P&O|                 6|
|           Cunard|                 3|
|Regent_Seven_Seas|                 5|
|              MSC|                 8|
|         Carnival|                22|
|          Crystal|                 2|
|           Orient|                 1|
|         Princess|                17|
|        Silversea|                 4|
|         Seabourn|                 3|
| Holland_American|                14|
|         Windstar|                 3|
|           Disney|                 2|
|        Norwegian|                13|
|          Oceania|                 3|
|          Azamara|                 2|
|        Celebrity|                10|
|             Star|                 6|
|  Royal_Caribbean|                23|
+-----------------+------------------+



In [9]:
# Feature Engineering, contd.
# Map every Cruise_line string to a numeric index stored in a new
# Cruise_line_label column.
indexer = StringIndexer(inputCol="Cruise_line", outputCol="Cruise_line_label")
indexer_model = indexer.fit(df)
output_1 = indexer_model.transform(df)
output_1.show()

StringIndexer_18a5dab59d44

In [10]:
# List the column names to confirm that Cruise_line_label was appended.
output_1.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Cruise_line_label']

In [11]:
# One-hot encoding of the indexed cruise line. The encoded column feeds the
# feature vector that the regression model is trained on.
encoder = OneHotEncoder(inputCol='Cruise_line_label', outputCol='Cruise_line_enc')
# Spark 3.x made OneHotEncoder an Estimator (fit first, then transform); in
# Spark 2.x it is a plain Transformer without fit(). Support both.
if hasattr(encoder, 'fit'):
    output_2 = encoder.fit(output_1).transform(output_1)
else:
    output_2 = encoder.transform(output_1)

In [13]:
# Peek at the encoded cruise-line vector of the first row.
first_encoded_row = output_2.select('Cruise_line_enc').head()
first_encoded_row[0]

SparseVector(19, {16: 1.0})

In [25]:
# Feature vectors, one-hot variant: the six numeric columns followed by the
# one-hot encoded cruise line, packed into a single 'features' column.
onehot_feature_cols = [
    'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density',
    'Cruise_line_enc',
]
assembler = VectorAssembler(inputCols=onehot_feature_cols, outputCol='features')
final_output_2 = assembler.transform(output_2)

Row(Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, Cruise_line_label=16.0)
Row(Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, Cruise_line_enc=SparseVector(19, {16: 1.0}))
Row(features=SparseVector(25, {0: 6.0, 1: 30.277, 2: 6.94, 3: 5.94, 4: 3.55, 5: 42.64, 22: 1.0}))


In [26]:
# Feature vectors - 1
# Label-index variant: the six numeric columns followed by the raw
# Cruise_line_label index, packed into a single 'features' column.
label_feature_cols = [
    'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density',
    'Cruise_line_label',
]
assembler = VectorAssembler(inputCols=label_feature_cols, outputCol='features')
final_output = assembler.transform(output_1)

In [30]:
# Print the first row at each stage to compare the two cruise-line encodings:
# the raw columns first, then the assembled feature vectors.
numeric_cols = ['Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density']

print("Cataegorical encoding")
print(output_1.select(numeric_cols + ['Cruise_line_label']).head())
print("\nOne hot encoding")
print(output_2.select(numeric_cols + ['Cruise_line_enc']).head())

print("\nAssembler Cataegorical")
print(final_output.select('features').head())
print("\nAssembler one hot encoding")
print(final_output_2.select('features').head())

Cataegorical encoding
Row(Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, Cruise_line_label=16.0)

One hot encoding
Row(Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, Cruise_line_enc=SparseVector(19, {16: 1.0}))

Assembler Cataegorical
Row(features=DenseVector([6.0, 30.277, 6.94, 5.94, 3.55, 42.64, 16.0]))

Assembler one hot encoding
Row(features=SparseVector(25, {0: 6.0, 1: 30.277, 2: 6.94, 3: 5.94, 4: 3.55, 5: 42.64, 22: 1.0}))


In [31]:
# Feature vectors - 2
# Inspect the first assembled feature vector of the one-hot variant.
final_output_2.select('features').head()

Row(features=SparseVector(25, {0: 6.0, 1: 30.277, 2: 6.94, 3: 5.94, 4: 3.55, 5: 42.64, 22: 1.0}))

In [33]:
# Feature vectors - 3
# Keep only what the model needs: the one-hot feature vector and the crew label.
final_data = final_output_2.select("features", "crew")

In [34]:
# test, train
# 70/30 random split. The seed is fixed so that the split, and therefore the
# fitted coefficients and evaluation metrics, can be reproduced when the
# notebook is re-run from a fresh kernel.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [35]:
# Row counts of the training and test splits.
train.count(), test.count()

(112, 46)

In [36]:
# model
# Linear regression that reads the 'features' vector column and predicts 'crew'.
lr = LinearRegression(featuresCol='features', labelCol='crew')

In [37]:
# model train
# Fit on the training split, then report the learned weights: one coefficient
# per slot of the feature vector, plus the intercept.
lrModel = lr.fit(train)
coefficients, intercept = lrModel.coefficients, lrModel.intercept
print("Coefficients: {} Intercept: {}".format(coefficients, intercept))

Coefficients: [-0.0013318984820761253,0.017044383830641628,-0.021544742574925984,0.4239633934742099,0.5254432791681632,0.004557647306912949,-0.9236925701369246,0.6742092819225582,0.2807514769826121,-0.25439008251375933,0.5690094428527649,0.0340753348935867,0.9310981811401231,-0.14435982168529504,0.5565937990440182,0.13475628408144408,0.4921194091670434,0.11838090764074259,0.5720790740053633,0.31803056649936434,-0.2613327475557143,0.14301075697046042,0.05215456474110098,0.0,0.2624680601047962] Intercept: -1.4388193143543762


In [38]:
# model evaluate
# Score the fitted model on the held-out test split; the returned summary
# exposes the error metrics printed in the next cell.
test_results = lrModel.evaluate(test)

In [39]:
# Report the test-set error metrics and the coefficient of determination.
rmse = test_results.rootMeanSquaredError
mse = test_results.meanSquaredError
r2 = test_results.r2
print("RMSE: {}".format(rmse))
print("MSE: {}".format(mse))
print("R2: {}".format(r2))

RMSE: 0.7918787384176074
MSE: 0.6270719363578615
R2: 0.9547474727921619


Looking at the Data

In [40]:
# Correlation between crew size and number of cabins over the full dataset.
crew_cabins_corr = df.agg(F.corr('crew', 'cabins'))
crew_cabins_corr.show()

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+

