In [1]:
#spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName ("Cruise Ships Regression").getOrCreate ()

In [2]:
#loading the dataset and preparing to machine learning
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

In [3]:
data = spark.read.csv ("cruise_ship_info.csv", inferSchema=True, header=True)

In [4]:
indexer = StringIndexer (inputCol="Cruise_line", outputCol="line")
data = indexer.fit (data).transform (data)
data = data.select (["line", "Age", "Tonnage", "passengers", "length", "cabins", "passenger_density", "crew"])

In [5]:
features = ["line", "Age", "Tonnage", "passengers", "length", "cabins", "passenger_density", "crew"]
data = data.select (features)

In [6]:
assembler = VectorAssembler (inputCols=["line", "Age", "Tonnage", "passengers", "length", "cabins", "passenger_density"], outputCol="features")
df = assembler.transform (data).select ("features", "crew")
df.show ()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[16.0,6.0,30.2769...|3.55|
|[16.0,6.0,30.2769...|3.55|
|[1.0,26.0,47.262,...| 6.7|
|[1.0,11.0,110.0,2...|19.1|
|[1.0,17.0,101.353...|10.0|
|[1.0,22.0,70.367,...| 9.2|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,23.0,70.367,...| 9.2|
|[1.0,19.0,70.367,...| 9.2|
|[1.0,6.0,110.2389...|11.5|
|[1.0,10.0,110.0,2...|11.6|
|[1.0,28.0,46.052,...| 6.6|
|[1.0,18.0,70.367,...| 9.2|
|[1.0,17.0,70.367,...| 9.2|
|[1.0,11.0,86.0,21...| 9.3|
|[1.0,8.0,110.0,29...|11.6|
|[1.0,9.0,88.5,21....|10.3|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,12.0,88.5,21...| 9.3|
|[1.0,20.0,70.367,...| 9.2|
+--------------------+----+
only showing top 20 rows



In [7]:
#train-test-splitting
(train, test) = df.randomSplit ([0.7, 0.3], seed=0)

In [8]:
#fitting the model to the training data
from pyspark.ml.regression import LinearRegression

lr = LinearRegression (featuresCol="features", labelCol="crew", predictionCol="prediction")
model = lr.fit (train)


In [9]:
#testing the model
test_results = model.evaluate (test)
print ("Root Mean Squared Error: ", test_results.rootMeanSquaredError)
print ("R2 Coeffcient:", test_results.r2)

print (model.intercept)
print (model.coefficients)


Root Mean Squared Error:  0.7175117976753568
R2 Coeffcient: 0.9614970428267888
-0.702304920029873
[0.02983426033990435,-0.013010363973426111,0.01020005625414865,-0.15569501178535697,0.42847792417321706,0.8416762689599384,-0.004919511460409775]


In [10]:
#making new predictions
utest = test.select ("features")
pred = model.transform (utest)

In [11]:
#plotting results
#import numpy as np
#import matplotlib.pyplot as plt

#reshaping test vector for plotting

#test_reshaped = test.select ("features").collect ()
#for i in range (len (test_reshaped)):
 # test_reshaped[i] = test_reshaped [i][0][0]

#x = np.array (data.select ("passengers").collect ())
#y = np.array (data.select ("crew").collect ())
#y_pred = np.array (pred.select ("prediction").collect ())

#plt.scatter (x, y, color="blue")
#plt.plot (test_reshaped, y_pred, color="red")
#plt.show ()

In [28]:
#linear regression using the sklearn library
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_squared_error
import pandas as pd

sk_data = pd.read_csv ("cruise_ship_info.csv")

x = sk_data [["Cruise_line", "Age", "Tonnage", "passengers", "length", "cabins"]].to_numpy ()
y = sk_data ["crew"].to_numpy ()

label_enc = LabelEncoder ()
x[:, 0] = label_enc.fit_transform (x[:, 0])
one_hot = OneHotEncoder (categorical_features=[0])
#x = one_hot.fit_transform (x).toarray ()

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split (x, y, test_size=0.3, random_state=0)

sk_model = LinearRegression ()
sk_model.fit (x_train, y_train)
sk_pred = sk_model.predict (x_test)
print ("Root Mean Squared Error:", mean_squared_error (y_test, sk_pred))
print ("R2 Coefficient:", sk_model.score (x_test, y_test))

print (sk_model.intercept_)
print (sk_model.coef_)

#plt.scatter (x, y, color="blue")
#plt.plot (x_test, sk_pred, color="red")
#plt.show ()

Root Mean Squared Error: 0.41206721822917464
R2 Coefficient: 0.9590685237015221
-0.2261587040466555
[-0.034262   -0.00795115  0.01337923 -0.13462169  0.40571745  0.7587587 ]
