**Research Question**: Create a regression model that will help predict how many crew members will be needed for future ships.

In [16]:
import findspark
findspark.init('/home/shashank/spark-2.3.2-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('lr_codealong').getOrCreate()
from pyspark.ml.regression import LinearRegression

In [17]:
data = spark.read.csv("cruise_ship_info.csv", inferSchema=True, header=True)

# Explore Data

In [18]:
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [19]:
data.count()

158

In [20]:
data.createOrReplaceTempView('data_sql')

In [21]:
spark.sql("FROM data_sql SELECT *").show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [22]:
#convert string to dummies
import pyspark.sql.functions as F
data_cat = data.select('Cruise_line').distinct().rdd.flatMap(lambda x:x).collect()
exprs = [F.when(F.col('Cruise_line') == i, 1).otherwise(0).alias(str(i)) for i in data_cat]
data = data.select(exprs+data.columns)
data.head()

Row(Costa=0, P&O=0, Cunard=0, Regent_Seven_Seas=0, MSC=0, Carnival=0, Crystal=0, Orient=0, Princess=0, Silversea=0, Seabourn=0, Holland_American=0, Windstar=0, Disney=0, Norwegian=0, Oceania=0, Azamara=1, Celebrity=0, Star=0, Royal_Caribbean=0, Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)

In [23]:
data.printSchema()

root
 |-- Costa: integer (nullable = false)
 |-- P&O: integer (nullable = false)
 |-- Cunard: integer (nullable = false)
 |-- Regent_Seven_Seas: integer (nullable = false)
 |-- MSC: integer (nullable = false)
 |-- Carnival: integer (nullable = false)
 |-- Crystal: integer (nullable = false)
 |-- Orient: integer (nullable = false)
 |-- Princess: integer (nullable = false)
 |-- Silversea: integer (nullable = false)
 |-- Seabourn: integer (nullable = false)
 |-- Holland_American: integer (nullable = false)
 |-- Windstar: integer (nullable = false)
 |-- Disney: integer (nullable = false)
 |-- Norwegian: integer (nullable = false)
 |-- Oceania: integer (nullable = false)
 |-- Azamara: integer (nullable = false)
 |-- Celebrity: integer (nullable = false)
 |-- Star: integer (nullable = false)
 |-- Royal_Caribbean: integer (nullable = false)
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = t

In [24]:
#Import Vectors and Vector Assembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [25]:
data.columns

['Costa',
 'P&O',
 'Cunard',
 'Regent_Seven_Seas',
 'MSC',
 'Carnival',
 'Crystal',
 'Orient',
 'Princess',
 'Silversea',
 'Seabourn',
 'Holland_American',
 'Windstar',
 'Disney',
 'Norwegian',
 'Oceania',
 'Azamara',
 'Celebrity',
 'Star',
 'Royal_Caribbean',
 'Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [26]:
assemble = VectorAssembler(inputCols=['Costa', 'P&O',  'Cunard',  'Regent_Seven_Seas',  'MSC',  'Carnival',  'Crystal', \
 'Orient',  'Princess',  'Silversea',  'Seabourn',  'Holland_American',  'Windstar',  'Disney','Norwegian', 'Oceania', \
 'Azamara',  'Celebrity', 'Star', 'Royal_Caribbean', 'Age', 'Tonnage', 'passengers', 'length', 'cabins', \
                                      'passenger_density'], outputCol= 'features')

In [27]:
output = assemble.transform(data)

In [28]:
output.printSchema() #features vector present

root
 |-- Costa: integer (nullable = false)
 |-- P&O: integer (nullable = false)
 |-- Cunard: integer (nullable = false)
 |-- Regent_Seven_Seas: integer (nullable = false)
 |-- MSC: integer (nullable = false)
 |-- Carnival: integer (nullable = false)
 |-- Crystal: integer (nullable = false)
 |-- Orient: integer (nullable = false)
 |-- Princess: integer (nullable = false)
 |-- Silversea: integer (nullable = false)
 |-- Seabourn: integer (nullable = false)
 |-- Holland_American: integer (nullable = false)
 |-- Windstar: integer (nullable = false)
 |-- Disney: integer (nullable = false)
 |-- Norwegian: integer (nullable = false)
 |-- Oceania: integer (nullable = false)
 |-- Azamara: integer (nullable = false)
 |-- Celebrity: integer (nullable = false)
 |-- Star: integer (nullable = false)
 |-- Royal_Caribbean: integer (nullable = false)
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = t

# Select final data

In [29]:
output.createOrReplaceTempView('output_sql')

In [30]:
final_data = spark.sql("FROM output_sql SELECT features, crew AS label")

In [31]:
final_data.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(26,[16,20,21,22,...| 3.55|
|(26,[16,20,21,22,...| 3.55|
|(26,[5,20,21,22,2...|  6.7|
|(26,[5,20,21,22,2...| 19.1|
|(26,[5,20,21,22,2...| 10.0|
|(26,[5,20,21,22,2...|  9.2|
|(26,[5,20,21,22,2...|  9.2|
|(26,[5,20,21,22,2...|  9.2|
|(26,[5,20,21,22,2...|  9.2|
|(26,[5,20,21,22,2...| 11.5|
|(26,[5,20,21,22,2...| 11.6|
|(26,[5,20,21,22,2...|  6.6|
|(26,[5,20,21,22,2...|  9.2|
|(26,[5,20,21,22,2...|  9.2|
|(26,[5,20,21,22,2...|  9.3|
|(26,[5,20,21,22,2...| 11.6|
|(26,[5,20,21,22,2...| 10.3|
|(26,[5,20,21,22,2...|  9.2|
|(26,[5,20,21,22,2...|  9.3|
|(26,[5,20,21,22,2...|  9.2|
+--------------------+-----+
only showing top 20 rows



In [33]:
#Create train, test data

train,test = final_data.randomSplit([0.7,0.3])
print("Train Count: {}".format(train.count()))
print("Test Count: {}".format(test.count()))

Train Count: 114
Test Count: 44


In [34]:
print(train.describe().show())
print(test.describe().show())

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|               114|
|   mean|   7.8961403508772|
| stddev|3.6882023126058834|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+

None
+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|                44|
|   mean| 7.529999999999998|
| stddev|2.9949849555082793|
|    min|              0.59|
|    max|              13.6|
+-------+------------------+

None


In [36]:
#linear regression model
lr = LinearRegression()
lr.model = lr.fit(train)

In [37]:
test_results = lr.model.evaluate(test)

In [39]:
print("RMSE: {}".format(test_results.rootMeanSquaredError))
print("R-square {}".format(test_results.r2))

RMSE: 0.7625538653682009
R-square 0.9336660308807528


In [43]:
data.select('crew').describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              158|
|   mean|7.794177215189873|
| stddev|3.503486564627034|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



**Compared to mean and SD, the RMSE looks good**

In [44]:
#Explore really high R2, - high correlations?
from pyspark.sql.functions import corr

In [46]:
spark.sql("FROM data_sql SELECT corr(passengers, crew)").show() #high

+----------------------+
|corr(passengers, crew)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [48]:
spark.sql("FROM data_sql SELECT corr(cabins, crew)").show() #high

+------------------+
|corr(cabins, crew)|
+------------------+
|0.9508226063578497|
+------------------+



In [49]:
spark.sql("FROM data_sql SELECT corr(Age, crew)").show() #ok

+-------------------------------+
|corr(CAST(Age AS DOUBLE), crew)|
+-------------------------------+
|            -0.5306565039638852|
+-------------------------------+



In [50]:
spark.sql("FROM data_sql SELECT corr(Tonnage, crew)").show() #high

+-------------------+
|corr(Tonnage, crew)|
+-------------------+
| 0.9275688115449388|
+-------------------+



In [51]:
spark.sql("FROM data_sql SELECT corr(length, crew)").show() #high

+------------------+
|corr(length, crew)|
+------------------+
| 0.895856627101658|
+------------------+



The model does a good job of predicting the number of crew required based on different characteristics. <br>
We also notice that the correlation for Passengers, Tonnage, Length, and Cabins is high and positive with number of crew members.