In [75]:
# Imports
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [3]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("LinearRegression")
    .getOrCreate()
)

In [5]:
# Read the libsvm-format sample file; it yields a 'label' column and a
# 'features' vector column.
training = (
    spark.read
    .format("libsvm")
    .load('sample_linear_regression_data.txt')
)

In [6]:
# Print the column names and types of the loaded DataFrame.
training.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [7]:
# Preview the first rows of the training DataFrame.
training.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [10]:
# Configure the linear-regression estimator, naming the input, target and
# output columns explicitly.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

In [12]:
# Fit the estimator on the full data set; fit() returns the trained model.
lr_model = lr.fit(training)

In [15]:
# Report the learned weight vector and the intercept term.
print("Coefficients are: %s\n" % (lr_model.coefficients,))
print("Intercept is: %s" % (lr_model.intercept,))

Coefficients are: [0.00733507102258,0.831375758434,-0.809530795468,2.44119168688,0.519171379529,1.15345919035,-0.298912411281,-0.51285141862,-0.619712827067,0.695615180432]

Intercept is: 0.14228558260358093


In [18]:
# Fit quality taken from the model's own summary object.
print("RMSE is: %s\n" % (lr_model.summary.rootMeanSquaredError,))
print("R2 is: %s" % (lr_model.summary.r2,))


RMSE is: 10.16309157133015

R2 is: 0.027839179518600154


### Train/Test Split

In [20]:
# Load the complete sample data set; it is split into train and test
# portions in the next cell.
all_data = (
    spark.read
    .format("libsvm")
    .load('sample_linear_regression_data.txt')
)

In [21]:
# 70/30 train/test split. A fixed seed makes the split, and every metric
# computed from it, repeatable across runs instead of changing on each
# execution.
train_data, test_data = all_data.randomSplit([0.7, 0.3], seed=42)

In [22]:
# Preview the first rows of the training split.
train_data.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|-28.571478869743427|(10,[0,1,2,3,4,5,...|
|-28.046018037776633|(10,[0,1,2,3,4,5,...|
| -23.51088409032297|(10,[0,1,2,3,4,5,...|
|-23.487440120936512|(10,[0,1,2,3,4,5,...|
|-22.949825936196074|(10,[0,1,2,3,4,5,...|
|-22.837460416919342|(10,[0,1,2,3,4,5,...|
|-21.432387764165806|(10,[0,1,2,3,4,5,...|
|-20.212077258958672|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-19.884560774273424|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -19.66731861537172|(10,[0,1,2,3,4,5,...|
|-19.402336030214553|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|-18.845922472898582|(10,[0,1,2,3,4,5,...|
|-17.803626188664516|(10,[0,1,2,3,4,5,...|
|-17.494200356883344|(10,[0,1,2,3,4,5,...|
|-17.428674570939506|(10,[0,1,2,3,4,5,...|
|-17.026492264209548|(10,[0,1,2,3,4,5,...|
|-16.692207021311106|(10,[0,1,2,3,4,5,...|
+----------

In [23]:
# Preview the first rows of the test split.
test_data.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|-26.805483428483072|(10,[0,1,2,3,4,5,...|
|-26.736207182601724|(10,[0,1,2,3,4,5,...|
|-19.872991038068406|(10,[0,1,2,3,4,5,...|
| -18.27521356600463|(10,[0,1,2,3,4,5,...|
| -17.32672073267595|(10,[0,1,2,3,4,5,...|
|-17.065399625876015|(10,[0,1,2,3,4,5,...|
| -16.71909683360509|(10,[0,1,2,3,4,5,...|
|-15.951512565794573|(10,[0,1,2,3,4,5,...|
| -15.86200932757056|(10,[0,1,2,3,4,5,...|
|-15.780685032623301|(10,[0,1,2,3,4,5,...|
|-15.732088272239245|(10,[0,1,2,3,4,5,...|
|-15.348871155379253|(10,[0,1,2,3,4,5,...|
|-15.334767479922341|(10,[0,1,2,3,4,5,...|
|-14.328978509075442|(10,[0,1,2,3,4,5,...|
|-13.976130931152703|(10,[0,1,2,3,4,5,...|
|-12.977848725392104|(10,[0,1,2,3,4,5,...|
|-12.558575788856189|(10,[0,1,2,3,4,5,...|
|-12.491442077546413|(10,[0,1,2,3,4,5,...|
|-12.479280211451497|(10,[0,1,2,3,4,5,...|
| -12.46765638103286|(10,[0,1,2,3,4,5,...|
+----------

In [24]:
# Build a fresh estimator and fit it on the training split only, so the
# test split stays unseen.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)
lr_model = lr.fit(train_data)

In [25]:
# Metrics from the summary of the model fitted on train_data.
print("RMSE is: %s\n" % (lr_model.summary.rootMeanSquaredError,))
print("R2 is: %s" % (lr_model.summary.r2,))

RMSE is: 10.09961712270764

R2 is: 0.04238021457950125


In [26]:
# Score the held-out test split with the fitted model; the returned summary
# object exposes the residuals shown in the next cell.
test_result = lr_model.evaluate(test_data)

In [28]:
# Per-row error on the test split (label minus prediction).
test_result.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|-28.051490229217293|
| -23.56409050662954|
|-22.074796896236247|
|-20.481691818985247|
|-17.751375775783117|
|-18.460524892808596|
|-17.344123912249195|
|-15.408851476768293|
|-18.485265851988178|
|-17.709260359095623|
|-16.507582799162066|
|-13.355485312400399|
| -18.11581149670668|
| -16.62289509911032|
|-12.529668672476248|
|-16.254468073296827|
|-13.188908570089346|
|-15.666659171225405|
| -9.729930604278342|
|-10.829400647562075|
+-------------------+
only showing top 20 rows



In [29]:
# Keep only the 'features' column so the test rows look like new, unlabeled
# data that the model has to score.
unlabeled_data = test_data.select('features')

In [30]:
# Score the unlabeled rows; the result keeps 'features' and adds a
# 'prediction' column.
predictions = lr_model.transform(unlabeled_data)

In [31]:
# Preview the predicted values next to their feature vectors.
predictions.show()

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|(10,[0,1,2,3,4,5,...| 1.2460068007342224|
|(10,[0,1,2,3,4,5,...|-3.1721166759721835|
|(10,[0,1,2,3,4,5,...|  2.201805858167842|
|(10,[0,1,2,3,4,5,...| 2.2064782529806157|
|(10,[0,1,2,3,4,5,...|0.42465504310716856|
|(10,[0,1,2,3,4,5,...|  1.395125266932581|
|(10,[0,1,2,3,4,5,...| 0.6250270786441041|
|(10,[0,1,2,3,4,5,...|  -0.54266108902628|
|(10,[0,1,2,3,4,5,...|  2.623256524417618|
|(10,[0,1,2,3,4,5,...| 1.9285753264723222|
|(10,[0,1,2,3,4,5,...| 0.7754945269228202|
|(10,[0,1,2,3,4,5,...|-1.9933858429788551|
|(10,[0,1,2,3,4,5,...|   2.78104401678434|
|(10,[0,1,2,3,4,5,...| 2.2939165900348772|
|(10,[0,1,2,3,4,5,...|-1.4464622586764548|
|(10,[0,1,2,3,4,5,...| 3.2766193479047243|
|(10,[0,1,2,3,4,5,...| 0.6303327812331578|
|(10,[0,1,2,3,4,5,...| 3.1752170936789925|
|(10,[0,1,2,3,4,5,...| -2.749349607173155|
|(10,[0,1,2,3,4,5,...|-1.6382557334707852|
+----------

### E-Commerce Dataset

In [32]:
# Load the customer CSV: the first row holds the column names and Spark
# infers each column's type from the data.
all_data = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [33]:
# Check which columns were inferred as strings and which as numbers.
all_data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [36]:
# Preview the raw rows.
# NOTE(review): this prints the Email and Address columns — confirm the data
# is synthetic, or clear this output before sharing the notebook.
all_data.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

### Creating dataframe from input CSV

In [38]:
# List the column names so the numeric feature columns can be picked out.
all_data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [41]:
# Combine the four numeric customer columns into a single vector column
# named 'features'.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)

In [43]:
# Apply the assembler: every row gains the assembled 'features' column.
output = assembler.transform(all_data)

In [45]:
# Inspect the assembled feature vectors on their own.
output.select('features').show()

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [46]:
# Keep only what the model needs: the feature vector and the target column.
final_data = output.select('features', 'Yearly Amount Spent')

In [47]:
# 70/30 train/test split. A fixed seed makes the split, and the metrics
# reported below, repeatable across runs instead of changing on each
# execution.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [35]:
# Estimator for the e-commerce data: the target is 'Yearly Amount Spent'.
# The feature and prediction column names are the library defaults, written
# out here to match the earlier estimator cells.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
    predictionCol='prediction',
)

In [49]:
# Train the estimator on the training split.
lr_model = lr.fit(train)

In [50]:
# Metrics from the summary of the model fitted on the training split.
print("RMSE is: %s\n" % (lr_model.summary.rootMeanSquaredError,))
print("R2 is: %s" % (lr_model.summary.r2,))

RMSE is: 9.997961813915362

R2 is: 0.9842522112280273


In [51]:
# Score the held-out test split with the fitted model.
test_results = lr_model.evaluate(test)

In [53]:
# Per-row error on the test split.
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|  6.944751223775938|
| -6.017331798030511|
|  -0.67301703641931|
| 3.1542013225556502|
| -4.023708265643791|
|-3.9775651490506903|
| -8.867770688272628|
| -4.418878750475471|
|  6.663145602067232|
|-0.7501846449787308|
|-17.809135671944944|
| -2.032208847665629|
| -8.853548241649776|
|   8.38101009601371|
|  5.517121975295822|
| 3.6355931200027385|
|  17.72379388004424|
|  4.636051495043262|
|  5.433876368525091|
| -6.168006418033428|
+-------------------+
only showing top 20 rows



### Hyundai Dataset (cruise ship information)

In [55]:
# Load the cruise-ship CSV (header row, inferred column types) and inspect
# the resulting schema.
data = spark.read.csv(
    "cruise_ship_info.csv",
    header=True,
    inferSchema=True,
)
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [77]:
# Encode the string 'Cruise_line' category as a numeric 'cruise_cat' column.
# NOTE(review): the index is just a category id; feeding it to a linear model
# treats it as a magnitude — consider one-hot encoding instead.
indexer = StringIndexer(inputCol="Cruise_line", outputCol="cruise_cat")
indexer_model = indexer.fit(data)
indexed = indexer_model.transform(data)
indexed.head(5)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0),
 Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0),
 Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7, cruise_cat=1.0),
 Row(Ship_name='Conquest', Cruise_line='Carnival', Age=11, Tonnage=110.0, passengers=29.74, length=9.53, cabins=14.88, passenger_density=36.99, crew=19.1, cruise_cat=1.0),
 Row(Ship_name='Destiny', Cruise_line='Carnival', Age=17, Tonnage=101.353, passengers=26.42, length=8.92, cabins=13.21, passenger_density=38.36, crew=10.0, cruise_cat=1.0)]

In [79]:
# List the column names, including the new 'cruise_cat' index column.
indexed.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'cruise_cat']

In [80]:
# Combine the numeric ship attributes and the indexed cruise line into one
# 'features' vector. 'crew' is left out because it is the regression target.
assembler = VectorAssembler(
    inputCols=[
        'Age',
        'Tonnage',
        'passengers',
        'length',
        'cabins',
        'passenger_density',
        'cruise_cat',
    ],
    outputCol='features',
)

In [81]:
# Apply the assembler: every row gains the assembled 'features' column.
output = assembler.transform(indexed)

In [82]:
# Preview the rows with the new 'features' column attached.
output.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+--------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|cruise_cat|            features|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+--------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|[6.0,30.276999999...|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|[6.0,30.276999999...|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|       1.0|[26.0,47.262,14.8...|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|       1.0|[11.0,110.0,29.74...|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0| 

In [83]:
# Look at one full Row to see the untruncated feature vector.
output.head(1)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0, features=DenseVector([6.0, 30.277, 6.94, 5.94, 3.55, 42.64, 16.0]))]

In [84]:
# Keep only the model inputs: the feature vector and the 'crew' target.
final_data = output.select('features', 'crew')
final_data.show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [85]:
# Estimator for the cruise-ship data: the target is 'crew'. The feature and
# prediction column names are the library defaults, written out here to match
# the earlier estimator cells.
lr = LinearRegression(
    featuresCol='features',
    labelCol='crew',
    predictionCol='prediction',
)

In [86]:
# 70/30 train/test split. A fixed seed makes the split, and the metrics
# reported below, repeatable across runs instead of changing on each
# execution.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [87]:
# Fit the regressor on the training split.
lr_model = lr.fit(train_data)

In [88]:
# Metrics from the summary of the model fitted on the training split.
print("RMSE is: %s\n" % (lr_model.summary.rootMeanSquaredError,))
print("R2 is: %s" % (lr_model.summary.r2,))

RMSE is: 0.9623360555776993

R2 is: 0.9218887435992341


In [89]:
# Score the held-out test split with the fitted model.
test_results = lr_model.evaluate(test_data)

In [90]:
# Held-out metrics, for comparison with the training metrics printed above.
print("RMSE is: %s\n" % (test_results.rootMeanSquaredError,))
print("R2 is: %s" % (test_results.r2,))

RMSE is: 0.9135097545008926

R2 is: 0.9309680571579058
