# Linear Regression-Cruise Ship

There is a csv file for you called "cruise_ship_info.csv". 

Task: Create a regression model that will help predict how many crew members will be needed for future ships. Then test it.

In [3]:
# Import SparkSession, then obtain (or reuse) the session used by every cell below
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('Hyundai')
    .getOrCreate()
)

In [31]:
# Read the cruise ship CSV into a DataFrame.
# header=True takes column names from the first row; inferSchema=True lets Spark infer column types.
data = spark.read.csv(
    "spark_master/Spark_for_Machine_Learning/Linear_Regression/cruise_ship_info.csv",
    header=True,
    inferSchema=True,
)

In [32]:
# Print the schema of the DataFrame (column names, inferred types, nullability)
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [8]:
# Show the first 5 rows, one Row per print, separated by blank lines
first_five = data.head(5)
for ship in first_five:
    print(ship)
    print('\n')

Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)


Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)


Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7)


Row(Ship_name='Conquest', Cruise_line='Carnival', Age=11, Tonnage=110.0, passengers=29.74, length=9.53, cabins=14.88, passenger_density=36.99, crew=19.1)


Row(Ship_name='Destiny', Cruise_line='Carnival', Age=17, Tonnage=101.353, passengers=26.42, length=8.92, cabins=13.21, passenger_density=38.36, crew=10.0)




In [24]:
# Examine the Cruise_line variable: number of ships per cruise line.
# show() only prints the table and returns None, so its result is not bound to a name.
data.groupBy('Cruise_line').count().show()

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [27]:
# Import StringIndexer (used to turn the Cruise_line strings into numeric category indices)
from pyspark.ml.feature import StringIndexer

In [29]:
# Build a StringIndexer that maps each Cruise_line string to a numeric index in Cruise_cat
indexer = StringIndexer(inputCol="Cruise_line", outputCol="Cruise_cat")
# Fit on the full dataset to learn the label -> index mapping, then add the new column
indexer_model = indexer.fit(data)
indexed = indexer_model.transform(data)
# Peek at the first rows to confirm Cruise_cat was appended
indexed.head(3)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, Cruise_cat=16.0),
 Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, Cruise_cat=16.0),
 Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7, Cruise_cat=1.0)]

In [30]:
#View columns of indexed DataFrame
indexed.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Cruise_cat']

In [34]:
# A few things we need to do before Spark can accept the data!
# Spark ML estimators expect the predictors packed into a single vector column
# (named "features" by default) plus a label column (named "label" by default;
# this notebook points labelCol at 'crew' instead of renaming it).

# Import VectorAssembler and Vectors
# NOTE(review): Vectors does not appear to be used in the visible cells.
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [35]:
# Pack the numeric predictors, plus the indexed cruise line, into one "features" vector column.
# NOTE(review): Cruise_cat is a StringIndexer index, so a linear model will treat the cruise line
# as an ordered numeric quantity; consider one-hot encoding it - confirm this is intended.
assembler = VectorAssembler(
    inputCols=[
        'Age',
        'Tonnage',
        'passengers',
        'length',
        'cabins',
        'passenger_density',
        'Cruise_cat',
    ],
    outputCol="features",
)

In [36]:
# Append the assembled "features" vector column to the indexed DataFrame
output = assembler.transform(indexed)

In [37]:
# View the assembled feature vectors next to the dependent variable (crew); show() prints the top 20 rows
output.select("features",'crew').show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [38]:
# Keep only the feature vector and the target column (crew) for modelling
final_data = output.select("features",'crew')

In [39]:
# Split the data into train (~70%) and test (~30%) sets.
# A fixed seed makes the split, and therefore every metric computed below, reproducible
# across kernel restarts; without it each run draws a different split.
# Re-run the following cells after changing the seed to refresh their recorded outputs.
train_data,test_data = final_data.randomSplit([0.7,0.3], seed=42)

In [40]:
# View summary statistics (count, mean, stddev, min, max) of crew in the training split
train_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              107|
|   mean|8.140934579439264|
| stddev|3.555132227647397|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



In [41]:
# View summary statistics (count, mean, stddev, min, max) of crew in the test split
test_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                51|
|   mean|7.0666666666666655|
| stddev|3.3090625057056067|
|    min|              0.59|
|    max|              13.6|
+-------+------------------+



In [None]:
#Import LinearRegression
from pyspark.ml.regression import LinearRegression

In [42]:
# Configure the (not yet fitted) linear regression estimator:
# predictors come from the assembled "features" column, the target is crew.
cruise_lr = LinearRegression(featuresCol='features', labelCol='crew')

In [43]:
# Train on the training split only; the fitted model is kept as cruise_lrModel
cruise_lrModel = cruise_lr.fit(train_data)

In [44]:
# Evaluate the fitted model on the held-out test data; the returned summary exposes the metrics read below
cruise_results = cruise_lrModel.evaluate(test_data)

In [45]:
# View the Root Mean Squared Error on the test data (same units as crew)
cruise_results.rootMeanSquaredError

0.8224301488836313

In [47]:
# View summary statistics for train_data to put RMSE in context
# (same call as the earlier train_data summary cell).
# NOTE(review): the RMSE above was computed on test_data, so test_data.describe()
# may be the more direct comparison - confirm which is intended.
train_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              107|
|   mean|8.140934579439264|
| stddev|3.555132227647397|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



In [48]:
# R squared of the model on the test data
cruise_results.r2

0.9369930763910599

In [49]:
# Mean Squared Error on the test data (the square of the RMSE shown earlier)
cruise_results.meanSquaredError

0.676391349792752

In [50]:
# Mean Absolute Error on the test data
cruise_results.meanAbsoluteError

0.655349199248137

## Results: The RMSE (about 0.82) is good in the context of the crew mean being 8.1 and the standard deviation being 3.6.  The R squared is high at about .94.  This very high R squared is supported by the fact that there is a very high correlation between several of the independent variables and the dependent variable (passengers = .92 and cabins = .95; see the correlation cells below).  It also makes sense that the number of passengers and the number of cabins would be highly correlated with the size of the crew.

In [51]:
# Report the fitted coefficients (one per entry of the features vector, in assembler order)
# together with the intercept
print(
    "Coefficients: {} Intercept: {}".format(
        cruise_lrModel.coefficients,
        cruise_lrModel.intercept,
    )
)

Coefficients: [-0.0258516394192,0.0072186978497,-0.199302680543,0.423682757471,0.968598434829,-0.0265939132917,0.0636395582079] Intercept: 0.19976051462048117


In [52]:
from pyspark.sql.functions import corr

In [53]:
# Correlation between crew and passengers over the full (unsplit) DataFrame
data.select(corr('crew','passengers')).show()

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [54]:
# Correlation between crew and cabins over the full (unsplit) DataFrame
data.select(corr('crew','cabins')).show()

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+

