In [0]:
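# OVERVIEW
# Predictive model for how many crew members a ship will require.
# Dataset columns: ship name, cruise line, age, tonnage, passengers, length, cabins, passenger density, crew.
# `crew` is the prediction target; the model inputs are drawn from the other columns (ship name is not used).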

In [0]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook (getOrCreate reuses one if it already exists).
spark = (
    SparkSession.builder
    .appName('cruise')
    .getOrCreate()
)

# Load the cruise-ship CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    '/FileStore/shared_uploads/sejal@ibm.com/cruise_ship_info.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Print the column names and the types Spark inferred while reading the CSV.
df.printSchema()

In [0]:
# Render the raw DataFrame as a table for a first look at the data.
display(df)

Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
Journey,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55
Quest,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55
Celebration,Carnival,26,47.262,14.86,7.22,7.43,31.8,6.7
Conquest,Carnival,11,110.0,29.74,9.53,14.88,36.99,19.1
Destiny,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.0
Ecstasy,Carnival,22,70.367,20.52,8.55,10.2,34.29,9.2
Elation,Carnival,15,70.367,20.52,8.55,10.2,34.29,9.2
Fantasy,Carnival,23,70.367,20.56,8.55,10.22,34.23,9.2
Fascination,Carnival,19,70.367,20.52,8.55,10.2,34.29,9.2
Freedom,Carnival,6,110.239,37.0,9.51,14.87,29.79,11.5


In [0]:
# Row count per cruise line, shown with the largest counts first.
rows_per_line = df.groupBy('Cruise_line').count()
rows_per_line.orderBy('count', ascending=False).show()

In [0]:
from pyspark.ml.feature import StringIndexer
# Map each Cruise_line string to a numeric index stored in a new 'cruise_cat' column.
# NOTE(review): 'cruise_cat' is later fed to the regression as a plain number, which
# treats the cruise lines as ordered — consider one-hot encoding; confirm intent.
indexer = StringIndexer(
    inputCol='Cruise_line',
    outputCol='cruise_cat',
)
indexer_model = indexer.fit(df)
indexed = indexer_model.transform(df)

In [0]:
# Render the indexed DataFrame; it carries the extra 'cruise_cat' column.
display(indexed)

Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew,cruise_cat
Journey,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55,16.0
Quest,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55,16.0
Celebration,Carnival,26,47.262,14.86,7.22,7.43,31.8,6.7,1.0
Conquest,Carnival,11,110.0,29.74,9.53,14.88,36.99,19.1,1.0
Destiny,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.0,1.0
Ecstasy,Carnival,22,70.367,20.52,8.55,10.2,34.29,9.2,1.0
Elation,Carnival,15,70.367,20.52,8.55,10.2,34.29,9.2,1.0
Fantasy,Carnival,23,70.367,20.56,8.55,10.22,34.23,9.2,1.0
Fascination,Carnival,19,70.367,20.52,8.55,10.2,34.29,9.2,1.0
Freedom,Carnival,6,110.239,37.0,9.51,14.87,29.79,11.5,1.0


In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
# Echo the column names available on the indexed DataFrame.
indexed.columns

In [0]:
# Columns packed into the single 'features' vector. 'crew' is left out because it is
# the regression label; 'Ship_name' and the raw 'Cruise_line' string are not included.
feature_cols = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'cruise_cat',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [0]:
# Apply the assembler to the indexed data, adding the 'features' vector column.
output = assembler.transform(indexed)

In [0]:
# Preview the assembled feature vectors next to the 'crew' label.
output.select('features', 'crew').show()

In [0]:
# Keep only the two columns the regression needs: the feature vector and the label.
final_data = output.select('features', 'crew')

In [0]:
# Roughly 70/30 train/test split. The fixed seed keeps the split — and therefore
# the evaluation metrics computed from it — repeatable across notebook runs.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Summary statistics for the training split.
train_data.describe().show()

In [0]:
# Summary statistics for the test split, for comparison with the training split.
test_data.describe().show()

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
# Linear regression predicting 'crew' from the assembled 'features' vector.
ship_lr = LinearRegression(featuresCol='features', labelCol='crew')

# Fit on the training split only.
trained_ship_model = ship_lr.fit(train_data)

# Score the fitted model on the held-out split; the returned summary exposes
# the error metrics inspected in the following cells.
ship_results = trained_ship_model.evaluate(test_data)

In [0]:
# Root mean squared error on the test split.
ship_results.rootMeanSquaredError

In [0]:
# R-squared on the test split.
ship_results.r2

In [0]:
# Mean squared error on the test split.
ship_results.meanSquaredError

In [0]:
# Mean absolute error on the test split.
ship_results.meanAbsoluteError

In [0]:
from pyspark.sql.functions import corr

In [0]:
# Correlation between crew size and passenger count, computed on the full dataset.
df.select(corr('crew', 'passengers')).show()

In [0]:
# Correlation between crew size and cabin count, computed on the full dataset.
df.select(corr('crew', 'cabins')).show()

In [0]:
# Correlation between crew size and tonnage, computed on the full dataset.
# The column is spelled 'Tonnage' in the data, so use the exact case instead of
# relying on Spark resolving column names case-insensitively.
df.select(corr('crew', 'Tonnage')).show()