## Using PySpark to create a linear regression model to predict the number of crew members a cruise line will need

By: Matt Purvis

In [0]:
# Import SparkSession
from pyspark.sql import SparkSession

In [0]:
# Get the active SparkSession, or create one with the application name 'lreg'
session_builder = SparkSession.builder.appName('lreg')
spark = session_builder.getOrCreate()

In [0]:
# Import LinearRegression
from pyspark.ml.regression import LinearRegression

In [0]:
# Read in the data

df = spark.read.csv('dbfs:/FileStore/tables/cruise_ship_info.csv', inferSchema = True, header = True)

In [0]:
# Preview the data
df.show()

In [0]:
# Import StringIndexer to create a dummy variable for Cruise_Line
from pyspark.ml.feature import StringIndexer

In [0]:
# Create indexer object
indexer = StringIndexer(inputCol="Cruise_line", outputCol="CruiseIndex")

In [0]:
# Perform the dummy variable encoding
indexed = indexer.fit(df).transform(df)

In [0]:
# Show the resulting df
indexed.show()

In [0]:
# Import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
# Show column names from encoded df
indexed.columns

In [0]:
# Create an assembler object to transform the features into a features vector
assembler = VectorAssembler(inputCols=['Age','Tonnage','passengers','length','cabins','passenger_density','CruiseIndex'], outputCol = 'features') 

In [0]:
# Perform the transformation
output = assembler.transform(indexed)

In [0]:
# Grab only the new features column and the labels
final_data = output.select(['features', 'crew'])

In [0]:
# Display the final_data df
final_data.show()

In [0]:
# Split into train and test sets
train_data, test_data = final_data.randomSplit([0.7,0.3])

In [0]:
# Describe the training set
train_data.describe().show()

In [0]:
# Describe the test set
test_data.describe().show()

In [0]:
# Create the linear regression estimator with 'crew' as the label column.
# No featuresCol is passed, so the estimator's default is used
# (presumably 'features', matching the assembler output - confirm).
lr = LinearRegression(labelCol = 'crew')

In [0]:
# Fit the model to the training data
lr_model = lr.fit(train_data)

In [0]:
# Evaluate the fitted model on the held-out test set
test_results = lr_model.evaluate(test_data)

In [0]:
# Root mean squared error on the test set
test_results.rootMeanSquaredError

In [0]:
# Summary statistics of crew, as a yardstick for the RMSE. Note these are taken
# over the full dataset (df), not just the test split.
# Compare rmse to mean and std dev - indicates that our rmse is not bad!
df.select('crew').describe().show()

In [0]:
# Display R^2 on the test set - not bad at all!
# NOTE(review): this was previously described as "explained variance"; r2 is the
# coefficient of determination. Spark's summary also has a separate
# explainedVariance attribute - confirm which one was intended.
test_results.r2

In [0]:
# Demonstrate how we can use the model to predict unknown crew member numbers
# for unlabeled data: simulate unlabeled input by keeping only the features
# column of the test set.
unlabeled_data = test_data.select('features')

In [0]:
# Score the unlabeled rows with the fitted model
predictions = lr_model.transform(unlabeled_data)

In [0]:
# Show the model's output for the unlabeled rows
predictions.show()

In [0]:
# Look at correlations to determine if our model results are realistic or not
from pyspark.sql.functions import corr

In [0]:
# Correlation for passengers is really high
df.select(corr('crew','passengers')).show()

In [0]:
# Correlation for cabins is really high
df.select(corr('crew','cabins')).show()

In [0]:
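# Our label column is highly correlated with several features (e.g. passengers and cabins), so the strong fit reported above is realistic rather than a sign of a mistake in the pipeline.
# Correlation alone does not prove the model is good; the held-out RMSE and R^2 are the evidence for that.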