In [37]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Linear_Regression")
    .getOrCreate()
)

# Load the cruise-ship CSV: the first row holds the column names and the
# column types are inferred from the data.
df = spark.read.csv("cruise_ship_info.csv", header=True, inferSchema=True)

# Expose the frame to Spark SQL under the view name "cruise", then print the
# inferred schema as a sanity check.
df.createOrReplaceTempView("cruise")
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [38]:
# Preview the first five rows of the raw data.
df.show(n=5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
only showing top 5 rows



In [40]:
# Number of distinct cruise lines, queried through the "cruise" SQL view.
distinct_lines = spark.sql("select count(distinct(Cruise_line)) from cruise")
distinct_lines.show()

# Number of rows (ships) recorded for each cruise line.
ships_per_line = df.groupBy("Cruise_line").count()
ships_per_line.show()

+---------------------------+
|count(DISTINCT Cruise_line)|
+---------------------------+
|                         20|
+---------------------------+

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [5]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer

In [7]:
# Encode the Cruise_line string column as a numeric index in a new "indexed" column.
string_indexer = StringIndexer(
    inputCol="Cruise_line",
    outputCol="indexed",
)
# Fit learns the label -> index mapping from df; transform appends "indexed" to df.
model = string_indexer.fit(df)
temp_df = model.transform(df)

In [8]:
# Confirm that the indexer appended the "indexed" column to the original schema.
temp_df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- indexed: double (nullable = false)



In [9]:
# Peek at the first five index values produced for Cruise_line.
indexed_preview = temp_df.select("indexed")
indexed_preview.show(n=5)

+-------+
|indexed|
+-------+
|   16.0|
|   16.0|
|    1.0|
|    1.0|
|    1.0|
+-------+
only showing top 5 rows



In [12]:
# List the available column names; used to pick the feature columns for the assembler.
temp_df.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'indexed']

In [16]:
# Columns packed into the single "features" vector consumed by the regressor.
# "crew" is deliberately absent: it is the label being predicted.
# NOTE(review): "indexed" is an arbitrary per-category index for Cruise_line, yet it
# enters the model as an ordinary numeric feature — consider one-hot encoding it.
feature_cols = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'indexed',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [17]:
# Append the assembled "features" vector column to the indexed frame.
output = assembler.transform(temp_df)

In [18]:
# Verify that the "features" vector column is now part of the schema.
output.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- indexed: double (nullable = false)
 |-- features: vector (nullable = true)



In [20]:
# Keep only what the regression needs: the feature vector and the "crew" label.
final = output.select(
    "features",
    "crew",
)

In [21]:
# 70/30 train/test split. A fixed seed makes the split repeatable for the same
# input data, so the model and every metric reported below can be reproduced
# after a kernel restart; without it each run draws a different partition.
train_data, test_data = final.randomSplit([0.7, 0.3], seed=42)

In [23]:
# NOTE(review): `testfile` is not used anywhere in the visible notebook — this looks
# like an accidental editor auto-import; remove it once confirmed unused.
from doctest import testfile


# Summary statistics for the training split, then the test split, to check
# that the two partitions look comparable.
for split in (train_data, test_data):
    split.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                96|
|   mean| 7.883125000000008|
| stddev|3.3262991328280482|
|    min|               0.6|
|    max|              21.0|
+-------+------------------+

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|               62|
|   mean|7.656451612903226|
| stddev|3.785309869627018|
|    min|             0.59|
|    max|             19.1|
+-------+-----------------+



In [24]:
# Linear regression predicting "crew" from the assembled "features" vector.
# No regularisation is configured here (Spark warns that regParam is zero).
linear = LinearRegression(featuresCol="features", labelCol="crew")

# Fit on the training split only; the test split is held out for evaluation.
linear_model = linear.fit(train_data)

22/09/10 17:43:23 WARN Instrumentation: [f325c6c2] regParam is zero, which might cause numerical instability and overfitting.
22/09/10 17:43:23 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/09/10 17:43:23 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/09/10 17:43:24 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [28]:
# Score the fitted model on the held-out test split.
test_result = linear_model.evaluate(test_data)

# Show the first five residuals, followed by the first five true crew values
# for a rough sense of the error scale.
test_result.residuals.show(n=5)
actual_crew = test_data.select("crew")
actual_crew.show(n=5)

+-------------------+
|          residuals|
+-------------------+
|-1.2681807756344714|
| 0.8528401244150201|
|0.21701897273838355|
|-0.7077666956194895|
| 0.5291426498031377|
+-------------------+
only showing top 5 rows

+-----+
| crew|
+-----+
|  8.0|
|  6.7|
|13.13|
| 3.55|
|11.09|
+-----+
only showing top 5 rows



In [32]:
# Test-set metrics, one per line: MAE, RMSE, then R^2.
print(
    test_result.meanAbsoluteError,
    test_result.rootMeanSquaredError,
    test_result.r2,
    sep="\n",
)

0.6908210247091862
1.2030615374030291
0.8973321032961441
