In [2]:
#spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName ("Cruise Ships Regression").getOrCreate ()

In [3]:
data = spark.read.csv ("cruise_ship_info.csv", inferSchema=True, header=True)
data.printSchema ()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [4]:
#loading the dataset and labeling categorical variables
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer (inputCol="Cruise_line", outputCol="line")
indexed = indexer.fit (data).transform (data)

features = ["line", "Age", "Tonnage", "passengers", "length", "cabins", "passenger_density", "crew"]
df = indexed.select (features)

In [5]:
#one hot encoding for categorical values
from pyspark.ml.feature import OneHotEncoderEstimator

encoder = OneHotEncoderEstimator (inputCols=["line"], outputCols=["line-vec"])
df = encoder.fit (df).transform (df)
df

DataFrame[line: double, Age: int, Tonnage: double, passengers: double, length: double, cabins: double, passenger_density: double, crew: double, line-vec: vector]

In [7]:
#preparing data for machine learning
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler (inputCols=["line-vec", "Age", "Tonnage", "passengers", "length", "cabins", "passenger_density"], outputCol="features")
df = assembler.transform (df).select ("features", "crew")
df.select ("features").toPandas ()
df.printSchema ()

root
 |-- features: vector (nullable = true)
 |-- crew: double (nullable = true)



In [14]:
lista = df.select ("features").collect ()
for i in lista:
    print (i[0])
    print ("\n")

(25,[16,19,20,21,22,23,24],[1.0,6.0,30.276999999999997,6.94,5.94,3.55,42.64])


(25,[16,19,20,21,22,23,24],[1.0,6.0,30.276999999999997,6.94,5.94,3.55,42.64])


(25,[1,19,20,21,22,23,24],[1.0,26.0,47.262,14.86,7.22,7.43,31.8])


(25,[1,19,20,21,22,23,24],[1.0,11.0,110.0,29.74,9.53,14.88,36.99])


(25,[1,19,20,21,22,23,24],[1.0,17.0,101.353,26.42,8.92,13.21,38.36])


(25,[1,19,20,21,22,23,24],[1.0,22.0,70.367,20.52,8.55,10.2,34.29])


(25,[1,19,20,21,22,23,24],[1.0,15.0,70.367,20.52,8.55,10.2,34.29])


(25,[1,19,20,21,22,23,24],[1.0,23.0,70.367,20.56,8.55,10.22,34.23])


(25,[1,19,20,21,22,23,24],[1.0,19.0,70.367,20.52,8.55,10.2,34.29])


(25,[1,19,20,21,22,23,24],[1.0,6.0,110.23899999999999,37.0,9.51,14.87,29.79])


(25,[1,19,20,21,22,23,24],[1.0,10.0,110.0,29.74,9.51,14.87,36.99])


(25,[1,19,20,21,22,23,24],[1.0,28.0,46.052,14.52,7.27,7.26,31.72])


(25,[1,19,20,21,22,23,24],[1.0,18.0,70.367,20.52,8.55,10.2,34.29])


(25,[1,19,20,21,22,23,24],[1.0,17.0,70.367,20.52,8.55,10.2,34.29])



In [15]:
#train-test-splitting
(train, test) = df.randomSplit ([0.7, 0.3])

In [16]:
#fitting the model to the training data
from pyspark.ml.regression import LinearRegression

lr = LinearRegression (featuresCol="features", labelCol="crew", predictionCol="prediction")
model = lr.fit (train)

In [17]:
#evaluating the model
results = model.evaluate (test)
print ("Root Mean Squared Error:", results.meanSquaredError)
print ("R2 Coefficient:", results.r2)
print (model.intercept)
print (model.coefficients)


Root Mean Squared Error: 0.7097603333105387
R2 Coefficient: 0.9596072441274395
-0.7754413494111596
[-0.004683943991255353,0.058183602048936967,0.018839305049923535,-0.1657492393342879,0.30994648444218675,0.8733652890938262,-0.003239842026681372]


In [19]:
#linear regression using the sklearn library
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_squared_error
import pandas as pd

sk_data = pd.read_csv ("cruise_ship_info.csv")

x = sk_data [["Cruise_line", "Age", "Tonnage", "passengers", "length", "cabins", "passenger_density"]].to_numpy ()
y = sk_data ["crew"].to_numpy ()

label_enc = LabelEncoder ()
x[:, 0] = label_enc.fit_transform (x[:, 0])
enc = OneHotEncoder (categorical_features=[0])
#x = enc.fit_transform (x).toarray ()

print (x)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split (x, y, test_size=0.3, random_state=0)

sk_model = LinearRegression ()
sk_model.fit (x_train, y_train)
sk_pred = sk_model.predict (x_test)
print ("Root Mean Squared Error:", mean_squared_error (y_test, sk_pred))
print ("R2 Coefficient:", sk_model.score (x_test, y_test))
print (sk_model.intercept_)
print (sk_model.coef_)

#plt.scatter (x, y, color="blue")
#plt.plot (x_test, sk_pred, color="red")
#plt.show ()

[[ 1.    0.    0.   ...  5.94  3.55 42.64]
 [ 1.    0.    0.   ...  5.94  3.55 42.64]
 [ 0.    1.    0.   ...  7.22  7.43 31.8 ]
 ...
 [ 0.    0.    0.   ...  4.4   0.74 33.86]
 [ 0.    0.    0.   ...  4.4   0.74 32.04]
 [ 0.    0.    0.   ...  6.17  1.56 47.87]]
Root Mean Squared Error: 0.5418854329744975
R2 Coefficient: 0.9461734159499426
-6241846065567.506
[ 6.24184607e+12  6.24184607e+12  6.24184607e+12  6.24184607e+12
  6.24184607e+12  6.24184607e+12  6.24184607e+12  6.24184607e+12
  6.24184607e+12  6.24184607e+12  6.24184607e+12  6.24184607e+12
  6.24184607e+12  6.24184607e+12  6.24184607e+12  6.24184607e+12
  6.24184607e+12  6.24184607e+12  6.24184607e+12  6.24184607e+12
  1.17187500e-02  2.63671875e-02 -7.39746094e-02  6.37817383e-01
  5.26611328e-01 -5.49316406e-03]
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
