In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise create one named "lrex".
session_builder = SparkSession.builder.appName("lrex")
spark = session_builder.getOrCreate()

# Load training data: CSV with a header row, column types inferred by Spark.
# NOTE(review): the path is workspace-specific and embeds a personal e-mail
# address; consider moving it to a configurable constant before sharing.
csv_reader = spark.read.format("csv").options(header=True, inferSchema=True)
df = csv_reader.load("dbfs:/FileStore/shared_uploads/necmettinceylan@hotmail.com/cruise_ship_info.csv")

# Cell output: the column names of the loaded DataFrame.
df.columns

In [0]:
# Print the inferred schema to check the column types Spark chose.
df.printSchema()

In [0]:
# Peek at the first four rows of the raw data.
df.head(4)

In [0]:
from pyspark.ml.feature import StringIndexer

In [0]:
# Fit a StringIndexer on the Cruise_line column, then use the fitted model to
# add the cruise_cat column to the DataFrame.
indexer = StringIndexer(inputCol="Cruise_line", outputCol="cruise_cat")
indexer_model = indexer.fit(df)
indexed = indexer_model.transform(df)

# Cell output: the first row, now including cruise_cat.
indexed.head(1)

In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler


In [0]:
# List the columns available after indexing (source for the feature list below).
indexed.columns

In [0]:
# Columns packed into the single "features" vector column.
# NOTE(review): cruise_cat is passed in as a plain numeric feature; confirm
# that is intended rather than a one-hot encoding.
feature_cols = [
    "Age",
    "Tonnage",
    "passengers",
    "length",
    "cabins",
    "passenger_density",
    "cruise_cat",
]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
output = assembler.transform(indexed)

# Cell output: the column names after assembling.
output.columns

In [0]:
# Preview the assembled feature vector next to the crew column.
features_and_crew = output.select("features", "crew")
features_and_crew.show()

In [0]:
# Keep only the two columns used for modelling: the feature vector and crew.
final_data = output.select("features", "crew")

In [0]:
# Split the modelling data with weights 0.7 (train) / 0.3 (test).
# Without a seed the split changes on every run, so the statistics and the RMSE
# reported further down could not be reproduced on Restart & Run All.
SPLIT_SEED = 42
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [0]:
# Show summary statistics for the training split.
train_data.describe().show()

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
# Linear regression predicting the crew column from the "features" vector.
# featuresCol is spelled out for readability; "features" is the estimator's default.
ship_lr = LinearRegression(
    featuresCol="features",
    labelCol="crew",
)

In [0]:
# Fit the regression on the training split only.
trained_ship_model = ship_lr.fit(train_data)

In [0]:
# Evaluate the fitted model on the test split.
test_ev = trained_ship_model.evaluate(test_data)
# Cell output: root mean squared error on the test split.
test_ev.rootMeanSquaredError

In [0]:
from pyspark.sql.functions import corr

In [0]:
# Correlation between crew and passengers over the full data set.
crew_passengers_corr = corr("crew", "passengers")
df.select(crew_passengers_corr).show()

In [0]:
# Correlation between crew and cabins over the full data set.
crew_cabins_corr = corr("crew", "cabins")
df.select(crew_cabins_corr).show()