In [1]:
from pyspark.sql import SparkSession

In [2]:
# Entry point for all DataFrame work in this notebook; getOrCreate() hands back
# an already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('lr_project')
    .getOrCreate()
)

In [3]:
# Load the cruise-ship CSV. The first row supplies the column names and the
# column types are inferred from the values rather than read as plain strings.
data = spark.read.csv(
    '/FileStore/tables/cruise_ship_info.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the raw rows (20 rows, long cell text truncated: show()'s defaults, spelled out).
data.show(n=20, truncate=True)

In [5]:
# Number of ships per cruise line.
# groupBy() gives no ordering guarantee, so the table is sorted explicitly
# (largest count first, ties broken by name) to keep the displayed output the
# same from run to run. No select() is needed beforehand: grouping by
# 'Cruise_line' and counting already yields only that key plus the count.
(
    data.groupBy('Cruise_line')
    .count()
    .orderBy(['count', 'Cruise_line'], ascending=[False, True])
    .show()
)

In [6]:
# Column names of the loaded frame, read straight from its schema.
data.schema.names

In [7]:
from pyspark.ml.feature import StringIndexer

# Configure (but do not yet fit) the indexer that maps each Cruise_line string
# to a numeric index stored in a new indexed_cruise_line column.
stringIndexer = StringIndexer(
    inputCol='Cruise_line',
    outputCol='indexed_cruise_line',
)

In [8]:
# Fit the indexer on the Cruise_line values and append its numeric output column.
# `data` is reassigned in place, so without the drop below a second execution of
# this cell would fit/transform a frame that already holds the output column and
# fail with an "output column already exists" error. DataFrame.drop() ignores
# names that are not present, so the first run is unaffected.
data = data.drop(stringIndexer.getOutputCol())
data = stringIndexer.fit(data).transform(data)

In [9]:
# Preview again: the frame now carries the indexed_cruise_line column.
data.show(20)

In [10]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [11]:
# Pack the model inputs into a single vector column named 'features'.
# NOTE(review): indexed_cruise_line is a category index but enters the vector as
# an ordinary number - confirm whether it should be one-hot encoded instead.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'indexed_cruise_line',
        'Age',
        'Tonnage',
        'passengers',
        'length',
        'cabins',
        'passenger_density',
    ],
)

In [12]:
# Apply the assembler: same rows as `data`, plus the 'features' vector column.
output = assembler.transform(dataset=data)

In [13]:
# Print the schema of the assembled frame to check the column names and types,
# including the newly added 'features' column.
output.printSchema()

In [14]:
# Each row's 'features' cell is one vector holding every assembled input value
# for that row; the same vector is visible in the Row returned by output.head().
output.select(output['features']).show()

In [15]:
# Inspect the Python type of the object returned by select() on the assembled frame.
type(output.select('features'))

In [16]:
# Look at the first assembled Row in full, including its 'features' vector.
output.first()

In [17]:
# Keep only what the regression consumes: the assembled 'features' vector and
# the 'crew' column used as the label, then preview the result.
final_data = output.select(['features', 'crew'])
final_data.show(n=20, truncate=True)

In [18]:
# Split the modeling frame into roughly 70% training and 30% test rows.
# randomSplit() samples randomly; without a seed each run produces a different
# split, so the model and every metric computed from test_data would change
# between runs. A fixed seed keeps the notebook reproducible.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [19]:
from pyspark.ml.regression import LinearRegression

In [20]:
# Linear regression estimator predicting 'crew' from the 'features' vector.
# featuresCol and predictionCol are the library defaults, written out explicitly.
lr = LinearRegression(
    featuresCol='features',
    labelCol='crew',
    predictionCol='prediction',
)

In [21]:
# Fit the regression on the training split only.
lr_model = lr.fit(dataset=train_data)

In [22]:
# Score the fitted model on the held-out split; the returned summary object
# exposes the residuals and error metrics read in the following cells.
test_results = lr_model.evaluate(dataset=test_data)

In [23]:
# Per-row residuals on the test split (first 20 rows).
test_results.residuals.show(n=20, truncate=True)

In [24]:
# Root mean squared error of the model on the test split, expressed in the
# units of the 'crew' label.
test_results.rootMeanSquaredError

In [25]:
# R-squared (coefficient of determination) of the model on the test split.
test_results.r2

In [26]:
# Summary statistics for the modeling frame, useful for judging the size of the
# error metrics against the spread of the label.
final_data.describe().show(n=20, truncate=True)

In [27]:
# Mean absolute error of the model on the test split, expressed in the units of
# the 'crew' label.
test_results.meanAbsoluteError

In [28]:
from pyspark.sql.functions import corr

In [29]:
# Summary statistics for the columns of the full (indexed) data frame.
data.describe().show(20, True)

In [30]:
# Pearson correlation between crew size and passenger count over all rows.
data.agg(corr('crew', 'passengers')).show()

In [31]:
# Pearson correlation between crew size and number of cabins over all rows.
data.agg(corr('crew', 'cabins')).show()