In [126]:
from pyspark.sql import SparkSession

In [127]:
from pyspark.ml.regression import LinearRegression

In [128]:
from pyspark.ml.linalg import Vector

In [129]:
from pyspark.ml.feature import VectorAssembler,StringIndexer,OneHotEncoder

In [130]:
spark =SparkSession.builder.appName('Cruis ship project').getOrCreate()

In [131]:
data = spark.read.csv('cruise_ship_info.csv',header=True,inferSchema=True)

In [132]:
data.take(5)[4].asDict()

{'Ship_name': 'Destiny',
 'Cruise_line': 'Carnival',
 'Age': 17,
 'Tonnage': 101.353,
 'passengers': 26.42,
 'length': 8.92,
 'cabins': 13.21,
 'passenger_density': 38.36,
 'crew': 10.0}

In [133]:
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [134]:
data.groupBy('Cruise_line').count().show()

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [135]:
indexer =StringIndexer(inputCol='Cruise_line',outputCol='Cruiseline_cat')

In [136]:
indexed= indexer.fit(data).transform(data)

In [137]:
indexed.show(10)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruiseline_cat|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|          16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|          16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|           1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|           1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|           1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|        

In [138]:
#encoder = OneHotEncoder(dropLast=True,inputCol='Cruiseline_cat',outputCol='CL_CAT')

In [139]:
#encoded = encoder.transform(indexed)

In [140]:
#encoded.columns

In [141]:
assembler = VectorAssembler(inputCols=['Age','Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'Cruiseline_cat'],outputCol='features')
featcols=['Age','Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'Cruiseline_cat']

In [142]:
output = assembler.transform(indexed)

In [143]:
output.take(10)[9]

Row(Ship_name='Freedom', Cruise_line='Carnival', Age=6, Tonnage=110.23899999999999, passengers=37.0, length=9.51, cabins=14.87, passenger_density=29.79, crew=11.5, Cruiseline_cat=1.0, features=DenseVector([6.0, 110.239, 37.0, 9.51, 14.87, 29.79, 1.0]))

In [144]:
final = output.select(['crew','features'])

In [145]:
final.show()

+----+--------------------+
|crew|            features|
+----+--------------------+
|3.55|[6.0,30.276999999...|
|3.55|[6.0,30.276999999...|
| 6.7|[26.0,47.262,14.8...|
|19.1|[11.0,110.0,29.74...|
|10.0|[17.0,101.353,26....|
| 9.2|[22.0,70.367,20.5...|
| 9.2|[15.0,70.367,20.5...|
| 9.2|[23.0,70.367,20.5...|
| 9.2|[19.0,70.367,20.5...|
|11.5|[6.0,110.23899999...|
|11.6|[10.0,110.0,29.74...|
| 6.6|[28.0,46.052,14.5...|
| 9.2|[18.0,70.367,20.5...|
| 9.2|[17.0,70.367,20.5...|
| 9.3|[11.0,86.0,21.24,...|
|11.6|[8.0,110.0,29.74,...|
|10.3|[9.0,88.5,21.24,9...|
| 9.2|[15.0,70.367,20.5...|
| 9.3|[12.0,88.5,21.24,...|
| 9.2|[20.0,70.367,20.5...|
+----+--------------------+
only showing top 20 rows



In [146]:
train_data,test_data = final.randomSplit([0.7,0.3])

In [147]:
regressor = LinearRegression(featuresCol='features',labelCol='crew',predictionCol='predicted crews')

In [148]:
model = regressor.fit(train_data)

In [149]:
eval_model = model.evaluate(test_data)

In [150]:
prediction = model.transform(test_data.select('features'))

In [151]:
prediction.show()

+--------------------+------------------+
|            features|   predicted crews|
+--------------------+------------------+
|[25.0,5.35,1.58,4...| 1.512875655776002|
|[24.0,10.0,2.08,4...|1.6431300910794218|
|[13.0,25.0,3.82,5...|2.9725823074259043|
|[36.0,16.852,9.52...|  3.01058701272943|
|[40.0,28.0,11.5,6...|3.1805870591880874|
|[23.0,25.0,7.76,6...| 3.807834707551601|
|[13.0,30.27699999...| 4.014367633216378|
|[15.0,30.27699999...|3.9683750641340767|
|[21.0,28.43,8.08,...| 4.006523459034641|
|[21.0,38.0,10.56,...| 4.505215746416018|
|[12.0,50.0,7.0,7....|  4.47659414358952|
|[25.0,38.0,7.49,6...|3.9962083105254624|
|[31.0,35.143,12.5...| 4.586586719906049|
|[18.0,51.004,9.4,...| 5.884923665381264|
|[27.0,53.872,14.9...| 6.626865370024243|
|[20.0,50.76,17.48...|  7.43101407482933|
|[10.0,68.0,10.8,7...|   6.5941808777881|
|[16.0,59.652,13.2...| 6.278780234372106|
|[15.0,78.491,24.3...| 8.226977179947792|
|[12.0,42.0,14.8,7...|6.8098314759568765|
+--------------------+------------

In [152]:
eval_model.rootMeanSquaredError

0.9401269278319621

In [153]:
eval_model.r2

0.9155232635112203

In [154]:
from pyspark.sql.functions import corr

In [155]:
data.select(corr('crew','passengers')).show()

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [156]:
coeff=model.coefficients

In [157]:
a= zip(featcols,coeff)

In [158]:
b= set(a)

In [159]:
for x,y in b:
    print(x ,':' ,y)
    print('\n')

passenger_density : -0.004629238626027429


Age : -0.022996284541150457


Cruiseline_cat : 0.051566646504548584


passengers : -0.15272123250400957


cabins : 0.8981671804572302


Tonnage : 0.006787748955648134


length : 0.392104778438069




In [160]:
data.select(corr('crew','length')).show()

+------------------+
|corr(crew, length)|
+------------------+
|0.8958566271016579|
+------------------+



In [161]:
data.select(corr('crew','passengers')).show()

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+

