In [None]:
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

### Our task is to develop a regression model that will predict the number of  crew members required for future ships from the given features.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder,Imputer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

### Read the data Crew.csv into spark dataframe
- inferSchema=True and header=True.
- Print the schema and show the first few rows.
- Use df.describe() to see the statistical properties of the data.

In [None]:
df=spark.read.csv("/content/drive/MyDrive/Data (1)/Data/Crew.csv",inferSchema=True,header=True)

In [None]:
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [None]:
df.show(20)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [None]:
df.describe().show()

+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|summary|Ship_name|Cruise_line|               Age|           Tonnage|       passengers|           length|            cabins|passenger_density|             crew|
+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|  count|      158|        158|               158|               158|              158|              158|               158|              158|              158|
|   mean| Infinity|       NULL|15.689873417721518| 71.28467088607599|18.45740506329114|8.130632911392404| 8.830000000000005|39.90094936708861|7.794177215189873|
| stddev|     NULL|       NULL| 7.615691058751413|37.229540025907866|9.677094775143416|1.793473548054825|4.4714172221480615| 8.63921711391542|3.503486564627034|
|    min|Adventure|    Azamara|   

### StringIndexer and OneHotEncoder
- Create StringIndexer and OneHotEncoder to process the data.
- StringIndexer is for any string data type.
- OneHotEncoder will be applied to the StringIndexer columns.
- Convert all obtained columns from OneHotEncoder and the other numeric columns into a feature column (use VectorAssembler)

In [None]:
dtypes=df.dtypes

In [None]:
dtypes

[('Ship_name', 'string'),
 ('Cruise_line', 'string'),
 ('Age', 'int'),
 ('Tonnage', 'double'),
 ('passengers', 'double'),
 ('length', 'double'),
 ('cabins', 'double'),
 ('passenger_density', 'double'),
 ('crew', 'double')]

## StringIndexer

In [None]:
CatCols= [ s for (s,d) in dtypes if d=="string"]
CatCols

['Ship_name', 'Cruise_line']

In [None]:
catCols_indexed= [ s+"_indexed" for s in CatCols]
catCols_indexed

['Ship_name_indexed', 'Cruise_line_indexed']

In [None]:
stind=StringIndexer(inputCols=CatCols,outputCols=catCols_indexed,handleInvalid="keep")


## OneHotEncoder

In [None]:
catCols_ohe= [ s+"_ohe" for s in CatCols]
catCols_ohe

['Ship_name_ohe', 'Cruise_line_ohe']

In [None]:
ohe=OneHotEncoder(inputCols=catCols_indexed,outputCols=catCols_ohe)

In [None]:
numCols= [ s for (s,d) in dtypes if d!="string"]
numCols

['Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

# Imputing

In [None]:
imput=Imputer(inputCols=numCols,outputCols=numCols)

## collect categorical and number columns

In [None]:
vec_Cols=catCols_ohe+numCols
vec_Cols

['Ship_name_ohe',
 'Cruise_line_ohe',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [None]:
final_cols=['Ship_name_ohe',
 'Cruise_line_ohe',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density']

## VectorAssembler

In [None]:
vecAssem=VectorAssembler(inputCols=final_cols,outputCol="features")

VectorAssembler_296304a98990

### Divide the data into Train/Test

In [None]:
train_df, test_df=df.randomSplit([0.8,0.2],seed=42)
print(f"There are {train_df.count()} rows in the training set, and {test_df.count()} in the test set")

There are 133 rows in the training set, and 25 in the test set


### Create a Linear Regression Model

In [None]:
lr=LinearRegression(featuresCol="features",labelCol="crew",predictionCol="prediction")

### Create a Pipeline model

In [None]:
pipe=Pipeline(stages=[stind,ohe,imput,vecAssem,lr])

### Fit the Pipeline model to the trainig data

In [None]:
pipe_model=pipe.fit(train_df)

### Make a prediction for the same training data and evaluate the model performance using RMSE and r2

In [None]:
pred_train_df=pipe_model.transform(train_df)

In [None]:
pred_train_df.show(5)

+---------+---------------+---+-------+----------+------+------+-----------------+-----+-----------------+-------------------+----------------+---------------+--------------------+------------------+
|Ship_name|    Cruise_line|Age|Tonnage|passengers|length|cabins|passenger_density| crew|Ship_name_indexed|Cruise_line_indexed|   Ship_name_ohe|Cruise_line_ohe|            features|        prediction|
+---------+---------------+---+-------+----------+------+------+-----------------+-----+-----------------+-------------------+----------------+---------------+--------------------+------------------+
|Adventure|Royal_Caribbean| 12|  138.0|     31.14|  10.2| 15.57|            44.32|11.85|             12.0|                1.0|(118,[12],[1.0])| (19,[1],[1.0])|(143,[12,119,137,...|11.849778861198041|
|  Allegra|          Costa| 21|  28.43|      8.08|  6.16|   4.1|            35.19|  4.0|             13.0|                5.0|(118,[13],[1.0])| (19,[5],[1.0])|(143,[13,123,137,...|4.0005990286325455|


In [None]:
rmse_evaluator_train=RegressionEvaluator(predictionCol="prediction",labelCol="crew",metricName="rmse")
rmse_evaluator_train.evaluate(pred_train_df)

0.07108459353708453

In [None]:
r2_evaluator_train=RegressionEvaluator(predictionCol="prediction",labelCol="crew",metricName="r2")
r2_evaluator_train.evaluate(pred_train_df)

0.9995973796114396

### Make a prediction for the test data and evaluate the model performance using RMSE and r2

In [None]:
pred_test_df=pipe_model.transform(test_df)

In [None]:
pred_test_df.show(5)

+---------+----------------+---+-------+----------+------+------+-----------------+----+-----------------+-------------------+----------------+---------------+--------------------+------------------+
|Ship_name|     Cruise_line|Age|Tonnage|passengers|length|cabins|passenger_density|crew|Ship_name_indexed|Cruise_line_indexed|   Ship_name_ohe|Cruise_line_ohe|            features|        prediction|
+---------+----------------+---+-------+----------+------+------+-----------------+----+-----------------+-------------------+----------------+---------------+--------------------+------------------+
|Amsterdam|Holland_American| 13|   61.0|      13.8|   7.8|  6.88|             44.2| 6.0|            118.0|                3.0|     (118,[],[])| (19,[3],[1.0])|(143,[121,137,138...| 6.043648879259668|
|  Artemis|             P&O| 29|   45.0|     11.78|  7.54|   5.3|             38.2| 5.2|            118.0|               10.0|     (118,[],[])|(19,[10],[1.0])|(143,[128,137,138...|3.6513892147390923|


In [None]:
rmse_evaluator_test=RegressionEvaluator(predictionCol="prediction",labelCol="crew",metricName="rmse")
rmse_evaluator_test.evaluate(pred_test_df)

1.9393240955972992

In [None]:
r2_evaluator_test=RegressionEvaluator(predictionCol="prediction",labelCol="crew",metricName="r2")
r2_evaluator_test.evaluate(pred_test_df)

0.6291305506372632

In [None]:
print(f"RMSE is {rmse_evaluator_test.evaluate(pred_test_df)}")
print(f"R2 is {r2_evaluator_test.evaluate(pred_test_df)}")

RMSE is 1.9393240955972992
R2 is 0.6291305506372632
