In [90]:
from pyspark.sql import SparkSession

In [91]:
from pyspark.ml.regression import LinearRegression

In [92]:
from pyspark.ml.linalg import Vector

In [93]:
from pyspark.ml.feature import VectorAssembler,StringIndexer,OneHotEncoder

In [94]:
spark =SparkSession.builder.appName('Cruis ship project').getOrCreate()

In [95]:
data = spark.read.csv('cruise_ship_info.csv',header=True,inferSchema=True)

In [96]:
data.take(5)[4].asDict()

{'Ship_name': 'Destiny',
 'Cruise_line': 'Carnival',
 'Age': 17,
 'Tonnage': 101.353,
 'passengers': 26.42,
 'length': 8.92,
 'cabins': 13.21,
 'passenger_density': 38.36,
 'crew': 10.0}

In [97]:
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [98]:
data.groupBy('Cruise_line').count().show()

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [99]:
indexer =StringIndexer(inputCol='Cruise_line',outputCol='Cruiseline_cat')

In [100]:
indexed= indexer.fit(data).transform(data)

In [101]:
indexed.show(10)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruiseline_cat|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|          16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|          16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|           1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|           1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|           1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|        

In [102]:
encoder = OneHotEncoder(dropLast=True,inputCol='Cruiseline_cat',outputCol='CL_CAT')

In [103]:
encoded = encoder.transform(indexed)

In [104]:
encoded.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Cruiseline_cat',
 'CL_CAT']

In [105]:
assembler = VectorAssembler(inputCols=['Age','Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'CL_CAT'],outputCol='features')
featcols=['Age','Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'CL_CAT']

In [106]:
output = assembler.transform(encoded)

In [107]:
output.take(10)[9]

Row(Ship_name='Freedom', Cruise_line='Carnival', Age=6, Tonnage=110.23899999999999, passengers=37.0, length=9.51, cabins=14.87, passenger_density=29.79, crew=11.5, Cruiseline_cat=1.0, CL_CAT=SparseVector(19, {1: 1.0}), features=SparseVector(25, {0: 6.0, 1: 110.239, 2: 37.0, 3: 9.51, 4: 14.87, 5: 29.79, 7: 1.0}))

In [108]:
final = output.select(['crew','features'])

In [109]:
final.show()

+----+--------------------+
|crew|            features|
+----+--------------------+
|3.55|(25,[0,1,2,3,4,5,...|
|3.55|(25,[0,1,2,3,4,5,...|
| 6.7|(25,[0,1,2,3,4,5,...|
|19.1|(25,[0,1,2,3,4,5,...|
|10.0|(25,[0,1,2,3,4,5,...|
| 9.2|(25,[0,1,2,3,4,5,...|
| 9.2|(25,[0,1,2,3,4,5,...|
| 9.2|(25,[0,1,2,3,4,5,...|
| 9.2|(25,[0,1,2,3,4,5,...|
|11.5|(25,[0,1,2,3,4,5,...|
|11.6|(25,[0,1,2,3,4,5,...|
| 6.6|(25,[0,1,2,3,4,5,...|
| 9.2|(25,[0,1,2,3,4,5,...|
| 9.2|(25,[0,1,2,3,4,5,...|
| 9.3|(25,[0,1,2,3,4,5,...|
|11.6|(25,[0,1,2,3,4,5,...|
|10.3|(25,[0,1,2,3,4,5,...|
| 9.2|(25,[0,1,2,3,4,5,...|
| 9.3|(25,[0,1,2,3,4,5,...|
| 9.2|(25,[0,1,2,3,4,5,...|
+----+--------------------+
only showing top 20 rows



In [110]:
train_data,test_data = final.randomSplit([0.7,0.3])

In [111]:
regressor = LinearRegression(featuresCol='features',labelCol='crew',predictionCol='predicted crews')

In [112]:
model = regressor.fit(train_data)

In [113]:
eval_model = model.evaluate(test_data)

In [114]:
prediction = model.transform(test_data.select('features'))

In [115]:
prediction.show()

+--------------------+------------------+
|            features|   predicted crews|
+--------------------+------------------+
|(25,[0,1,2,3,4,5,...|1.4174562147377325|
|(25,[0,1,2,3,4,5,...|1.0599381623319952|
|(25,[0,1,2,3,4,5,...|1.5604449547307633|
|(25,[0,1,2,3,4,5,...| 2.326415650838013|
|(25,[0,1,2,3,4,5,...|2.1625904357126475|
|(25,[0,1,2,3,4,5,...|2.8656178331845497|
|(25,[0,1,2,3,4,5,...| 3.550000000000001|
|(25,[0,1,2,3,4,5,...|3.5559719845312414|
|(25,[0,1,2,3,4,5,...| 4.017580020119661|
|(25,[0,1,2,3,4,5,...| 4.017580020119661|
|(25,[0,1,2,3,4,5,...| 2.921491430150605|
|(25,[0,1,2,3,4,5,...| 4.673873617438636|
|(25,[0,1,2,3,4,5,...| 5.227718612381922|
|(25,[0,1,2,3,4,5,...| 4.627750263973965|
|(25,[0,1,2,3,4,5,...| 5.362192731895041|
|(25,[0,1,2,3,4,5,...| 5.341416469294467|
|(25,[0,1,2,3,4,5,...| 6.398119745999635|
|(25,[0,1,2,3,4,5,...| 7.476709374472044|
|(25,[0,1,2,3,4,5,...| 6.487301015979057|
|(25,[0,1,2,3,4,5,...| 6.048709792805653|
+--------------------+------------

In [116]:
eval_model.rootMeanSquaredError

0.7990743314673693

In [163]:
eval_model.meanSquaredError

0.8838386404347632

In [117]:
eval_model.r2

0.9323424824110355

In [118]:
from pyspark.sql.functions import corr

In [119]:
data.select(corr('crew','passengers')).show()

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [120]:
coeff=model.coefficients

In [121]:
a= zip(featcols,coeff)

In [122]:
b= set(a)

In [123]:
for x,y in b:
    print(x ,':' ,y.format())
    print('\n')

Age : 0.008790010059830093


cabins : 0.7864884309718524


passengers : -0.08681596249436126


Tonnage : 0.00792097193644206


length : 0.4326421461199396


CL_CAT : -0.7417741355981805


passenger_density : 0.0076399640130505545




In [124]:
data.select(corr('crew','length')).show()

+------------------+
|corr(crew, length)|
+------------------+
|0.8958566271016579|
+------------------+



In [125]:
data.select(corr('crew','passengers')).show()

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+

