In [1]:
import warnings
warnings.filterwarnings(action="ignore")

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer
from pyspark.ml.regression import LinearRegression

In [2]:
spark = SparkSession.builder.appName("huyn").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/17 21:00:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## The data

Create a regression model that will help predict how many crew members will be needed for future ships. The client also mentioned that they have found that particular cruise lines will differ in acceptable crew counts, so it is most likely an important feature to include in our analysis!

In [3]:
data = spark.read.csv("cruise_ship_info.csv", header=True, inferSchema=True)

In [4]:
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [5]:
# data.show()

data.select(
    "Ship_name",
    "Cruise_line",
    "Age",
    F.format_number("Tonnage",2).alias("Tonnage"),
    "passengers",
    "length",
    "cabins",
    "passenger_density",
    "crew"
).show()

+-----------+-----------+---+-------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+-------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|  30.28|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|  30.28|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|  47.26|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11| 110.00|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17| 101.35|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|  70.37|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|  70.37|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23|  70.37|     20.56|  8.55| 10.22|            34.23| 9.2|
|Fascination|   Carnival| 19|  70.37|     20.52|  8.55

## Data Preprocessing & Feature Selection: Setting Up DataFrame for Machine Learning

In [6]:
# A few things we need to do before Spark can accept the data!
# It needs to be in the form of two columns
# ("label","features")

# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [7]:
data.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

Ship_name is a useless arbitrary string, but the Cruise_line itself may be useful. Let's make it into a categorical variable!

In [8]:
data.groupBy('Cruise_line').count().show()

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [9]:
indexer = StringIndexer(inputCol="Cruise_line", outputCol="CruiseLineIndex")

data = indexer.fit(data).transform(data)

data.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|CruiseLineIndex|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|            1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|            1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|            1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|

In [10]:
assembler = VectorAssembler(
    inputCols=["Age", "Tonnage", "passengers", "length", "cabins", "passenger_density", "CruiseLineIndex"],
    outputCol="features")

In [11]:
output = assembler.transform(data)

In [12]:
output.select("features").show()

+--------------------+
|            features|
+--------------------+
|[6.0,30.276999999...|
|[6.0,30.276999999...|
|[26.0,47.262,14.8...|
|[11.0,110.0,29.74...|
|[17.0,101.353,26....|
|[22.0,70.367,20.5...|
|[15.0,70.367,20.5...|
|[23.0,70.367,20.5...|
|[19.0,70.367,20.5...|
|[6.0,110.23899999...|
|[10.0,110.0,29.74...|
|[28.0,46.052,14.5...|
|[18.0,70.367,20.5...|
|[17.0,70.367,20.5...|
|[11.0,86.0,21.24,...|
|[8.0,110.0,29.74,...|
|[9.0,88.5,21.24,9...|
|[15.0,70.367,20.5...|
|[12.0,88.5,21.24,...|
|[20.0,70.367,20.5...|
+--------------------+
only showing top 20 rows



In [13]:
output.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- CruiseLineIndex: double (nullable = false)
 |-- features: vector (nullable = true)



In [14]:
final_data = output.select("features", "crew")

## Split the data into training and testing sets

In [15]:
train_data, test_data = final_data.randomSplit([0.7,0.3])

In [16]:
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                99|
|   mean| 7.866666666666674|
| stddev|3.4050044202674434|
|    min|              0.59|
|    max|              19.1|
+-------+------------------+



In [17]:
test_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|               59|
|   mean|7.672542372881357|
| stddev|3.689390938063252|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



## Creating a Linear Regression Model

In [18]:
# Create a Linear Regression Model object
lr = LinearRegression(labelCol="crew")

In [19]:
# Fit the model to the data and call this model lrModel
lrModel = lr.fit(train_data,)

24/01/17 21:00:31 WARN Instrumentation: [a5e655af] regParam is zero, which might cause numerical instability and overfitting.
24/01/17 21:00:32 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/01/17 21:00:32 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/01/17 21:00:32 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [20]:
# Print the coefficients and intercept for linear regression
print("Coefficients: {}\nIntercept: {}".format(lrModel.coefficients,lrModel.intercept))

Coefficients: [-0.018631662495977308,0.016581904790028204,-0.16830613384718454,0.3446809086021726,0.8777549074449205,-0.008809001460268638,0.04581917703254456]
Intercept: -0.41722479781047217


## Evaluate Performance

In [21]:
test_results = lrModel.evaluate(test_data)

In [22]:
# Interesting results....
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|-0.48236849946304616|
| -1.3488017455271812|
| -1.8202747897422054|
| -0.7759165184231192|
| -1.1054809965841326|
| -1.2912303503045948|
|  0.4157425729688349|
|  0.6724511161838453|
| -0.7573484623440851|
|  1.7026884358037346|
|-0.28640915239768905|
|-0.12126430392104925|
| -0.6651488110474251|
|  -0.259225418572246|
|  0.9135022914899587|
|-0.39731094607502015|
|  0.7584712967450677|
|  0.7742282900929442|
| -1.3670509996968079|
|  0.7512655246570823|
+--------------------+
only showing top 20 rows



In [23]:
unlabeled_data = test_data.select('features')

In [24]:
predictions = lrModel.transform(unlabeled_data)

In [25]:
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[4.0,220.0,54.0,1...|21.482368499463046|
|[5.0,86.0,21.04,9...| 9.348801745527181|
|[5.0,160.0,36.34,...|15.420274789742205|
|[6.0,30.276999999...| 4.325916518423119|
|[6.0,90.0,20.0,9....|10.105480996584133|
|[7.0,89.6,25.5,9....|11.161230350304594|
|[9.0,81.0,21.44,9...| 9.584257427031165|
|[9.0,88.5,21.24,9...| 9.627548883816155|
|[9.0,105.0,27.2,8...|11.437348462344085|
|[10.0,46.0,7.0,6....| 2.767311564196265|
|[10.0,58.825,15.6...| 7.286409152397689|
|[10.0,68.0,10.8,7...|  6.48126430392105|
|[10.0,110.0,29.74...|12.265148811047425|
|[11.0,86.0,21.24,...| 9.559225418572247|
|[11.0,90.0,22.4,9...|10.086497708510041|
|[11.0,90.09,25.01...|  8.87731094607502|
|[11.0,91.0,20.32,...| 9.231528703254932|
|[11.0,108.977,26....|11.225771709907056|
|[11.0,138.0,31.14...|13.217050999696808|
|[12.0,77.104,20.0...| 8.838734475342918|
+--------------------+------------

In [26]:
print("RMSE: {}".format(test_results.rootMeanSquaredError))
print("MSE: {}".format(test_results.meanSquaredError))
print("R^2: {}".format(test_results.r2))

RMSE: 1.0161479994913707
MSE: 1.0325567568703147
R^2: 0.9228335364223914


In [39]:
# Check Pearson coefficients
for col in data.columns[2:]:
    data.select(F.corr('crew',col)).show()

+-------------------+
|    corr(crew, Age)|
+-------------------+
|-0.5306565039638852|
+-------------------+

+-------------------+
|corr(crew, Tonnage)|
+-------------------+
|  0.927568811544939|
+-------------------+

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+

+------------------+
|corr(crew, length)|
+------------------+
|0.8958566271016579|
+------------------+

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+

+-----------------------------+
|corr(crew, passenger_density)|
+-----------------------------+
|         -0.15550928421699717|
+-----------------------------+

+----------------+
|corr(crew, crew)|
+----------------+
|             1.0|
+----------------+

+---------------------------+
|corr(crew, CruiseLineIndex)|
+---------------------------+
|        -0.5154627327113319|
+---------------------------+

