In [0]:
import pyspark
from pyspark.sql import SparkSession    
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
import numpy as np

In [0]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('cruise_ship_linear_regression')
    .getOrCreate()
)

# Load the cruise ship CSV: the first row is treated as the header and
# column types are inferred from the data.
data = spark.read.csv(
    '/FileStore/tables/CourseDATA/cruise_ship_info.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the first five rows of the raw data.
data.show(n=5)

In [0]:
# List the column names of the raw data (used below to pick the feature columns).
data.columns

In [0]:

# Encode the string-valued cruise line as a numeric index column so it can be
# fed to the vector assembler alongside the numeric attributes.
# NOTE(review): the index is fed to the regression as a plain number further
# down; confirm an ordinal encoding is intended rather than one-hot.
indexer = StringIndexer(inputCol="Cruise_line", outputCol="Cruise_Line_Index")
indexer_model = indexer.fit(data)
data_indexed = indexer_model.transform(data)

# Number of distinct index values produced
# (original author's note: 20 different cruise lines have been indexed).
data_indexed.select("Cruise_Line_Index").distinct().count()


In [0]:
# Columns combined into the model's feature vector: the indexed cruise line
# plus the numeric ship attributes. The label column ("crew") is deliberately
# not part of this list.
feature_columns = [
    "Cruise_Line_Index",
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
]

# The output column is named "features" (lowercase) so that it matches both the
# later `select("features", "crew")` and LinearRegression's default
# `featuresCol`. The previous name "Features" only worked because Spark
# resolves column names case-insensitively by default, and would break with
# spark.sql.caseSensitive=true.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Append the assembled feature vector to the indexed data.
output = assembler.transform(data_indexed)

In [0]:
# Inspect the columns after assembling, to check the feature vector column was added.
output.columns

In [0]:
# Keep only what the regression needs: the feature vector and the label (crew size).
final_data = output.select("features", "crew")


In [0]:
# 70% / 30% train/test split; the fixed seed keeps the split repeatable.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=24)

In [0]:
# Base estimator: linear regression predicting `crew`. Only the label column is
# set here; every other parameter keeps its library default and is either tuned
# through the parameter grid or used as-is for the baseline fit.
lr = LinearRegression(labelCol='crew')

In [0]:
# Hyper-parameter search space. The grid is the Cartesian product of all value
# lists below: 3 * 2 * 5 * 3 * 2 * 3 = 540 candidate configurations.
# elasticNetParam values are written as floats because it is a double-valued
# parameter (0.0 = pure L2 penalty, 1.0 = pure L1 penalty).
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.18, 0.10, 0.05])
    .addGrid(lr.fitIntercept, [False, True])
    .addGrid(lr.elasticNetParam, [0.5, 0.3, 1.0, 0.7, 0.0])
    .addGrid(lr.maxIter, [11, 7, 8])
    .addGrid(lr.standardization, [False, True])
    .addGrid(lr.solver, ["auto", "normal", "l-bfgs"])
    .build()
)

# Single train/validation split over the grid: every configuration is fitted on
# 80% of the training data and scored on the remaining 20%.
tvs = TrainValidationSplit(
    estimator=lr,
    # Keeps every fitted sub-model (one per grid point) on the resulting model.
    collectSubModels=True,
    estimatorParamMaps=paramGrid,
    # RMSE is the evaluator's default metric; spelled out here for clarity.
    evaluator=RegressionEvaluator(labelCol='crew', metricName='rmse'),
    # 80% of the data will be used for training, 20% for validation.
    trainRatio=0.8,
    # Fixed seed (same value as the train/test split) so the internal
    # train/validation split, and therefore the selected best model, is
    # repeatable from run to run.
    seed=24,
)

# Run the search; the result exposes the winning configuration as `bestModel`.
lr_model = tvs.fit(train_data)

In [0]:
# Score the best model found by the search on the held-out test set.
# NOTE: despite its name, `best_cruise_model` holds the evaluation summary
# returned by `evaluate` (read via .r2 / .rootMeanSquaredError /
# .meanSquaredError in the following cells), not the fitted model itself;
# the model remains available as `lr_model.bestModel`.
best_cruise_model = lr_model.bestModel.evaluate(test_data)

In [0]:
# R² of the tuned model on the test set (best_cruise_model is the evaluation summary).
best_cruise_model.r2

In [0]:
# Root mean squared error of the tuned model on the test set.
best_cruise_model.rootMeanSquaredError

In [0]:
# Mean squared error of the tuned model on the test set.
best_cruise_model.meanSquaredError

In [0]:
# Show the full parameter map of the best model selected by the train-validation search.
lr_model.bestModel.extractParamMap()

In [0]:
# Baseline for comparison: fit the same estimator directly on the training data,
# without the parameter grid search.
default_model = lr.fit(train_data)

In [0]:
# Evaluate the untuned baseline model on the held-out test set.
default_results = default_model.evaluate(test_data)

# Print RMSE, R², MSE and MAE, in that order, space-separated on one line.
baseline_metrics = (
    default_results.rootMeanSquaredError,
    default_results.r2,
    default_results.meanSquaredError,
    default_results.meanAbsoluteError,
)
print(*baseline_metrics)