In [0]:
#Linear Regression

In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
import pyspark.sql.functions as f
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [0]:
spark = SparkSession.builder.getOrCreate()

In [0]:
from pyspark.sql.types import *
Schema = StructType([
    StructField('YearsExperience', FloatType(), True),
    StructField('Salary', FloatType(), True)
])
df = spark.read.csv("/FileStore/tables/Salary_Data.csv", header=True,schema=Schema)

In [0]:
df.show()

+---------------+-------+
|YearsExperience| Salary|
+---------------+-------+
|            1.1|39343.0|
|            1.3|46205.0|
|            1.5|37731.0|
|            2.0|43525.0|
|            2.2|39891.0|
|            2.9|56642.0|
|            3.0|60150.0|
|            3.2|54445.0|
|            3.2|64445.0|
|            3.7|57189.0|
|            3.9|63218.0|
|            4.0|55794.0|
|            4.0|56957.0|
|            4.1|57081.0|
|            4.5|61111.0|
|            4.9|67938.0|
|            5.1|66029.0|
|            5.3|83088.0|
|            5.9|81363.0|
|            6.0|93940.0|
+---------------+-------+
only showing top 20 rows



In [0]:
df.count()

Out[6]: 30

In [0]:
df.corr("YearsExperience", "Salary")

Out[7]: 0.9782416177478152

In [0]:
assembler = VectorAssembler(inputCols=['YearsExperience'],outputCol='features')

data_set = assembler.transform(df)
data_set = data_set.select(['features','salary'])

In [0]:
train_data,test_data = data_set.randomSplit([0.8,0.2])

In [0]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol="features",labelCol='salary')
lrModel = lr.fit(train_data)

In [0]:
test_stats = lrModel.evaluate(test_data)
print(f"RMSE: {test_stats.rootMeanSquaredError}")
print(f"R2: {test_stats.r2}")
print(f"R2: {test_stats.meanSquaredError}")

RMSE: 8222.791461065537
R2: 0.8426626781047604
R2: 67614299.4121723


In [0]:
test_stats.predictions.show()

+--------+--------+------------------+
|features|  salary|        prediction|
+--------+--------+------------------+
|   [4.0]| 56957.0| 63592.77681394428|
|   [6.0]| 93940.0|  82519.0741734728|
|   [9.0]|105582.0|110908.52021276561|
+--------+--------+------------------+



In [0]:
# Multiple linear Regression
df = spark.sql("select * from startups")

In [0]:
df.show()

+---------+--------------+---------------+----------+---------+
|R&D Spend|Administration|Marketing Spend|     State|   Profit|
+---------+--------------+---------------+----------+---------+
| 165349.2|      136897.8|       471784.1|  New York|192261.83|
| 162597.7|      151377.6|      443898.53|California|191792.06|
|153441.52|     101145.55|      407934.53|   Florida|191050.39|
| 144372.4|     118671.85|      383199.62|  New York|182901.98|
|142107.34|      91391.77|       366168.4|   Florida|166187.94|
| 131876.9|      99814.71|      362861.38|  New York|156991.12|
|134615.45|     147198.88|      127716.82|California|156122.52|
|130298.13|     145530.06|       323876.7|   Florida| 155752.6|
|120542.52|     148718.95|      311613.28|  New York|152211.77|
|123334.88|     108679.17|      304981.62|California|149759.95|
|101913.08|     110594.11|      229160.95|   Florida|146121.95|
|100671.96|      91790.61|      249744.55|California| 144259.4|
| 93863.75|     127320.38|      249839.4

In [0]:
df.count()

Out[55]: 50

In [0]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol='State', outputCol='State_numeric')
indexer_fitted = indexer.fit(df)
df_indexed = indexer_fitted.transform(df)

In [0]:
indexer_fitted.labels

Out[16]: ['California', 'New York', 'Florida']

In [0]:
from pyspark.ml.feature import OneHotEncoder
encoder = OneHotEncoder(inputCols=['State_numeric'], outputCols=['State_onehot'])
df_onehot = encoder.fit(df_indexed).transform(df_indexed)

In [0]:
df_onehot.printSchema()

root
 |-- R&D Spend: double (nullable = true)
 |-- Administration: double (nullable = true)
 |-- Marketing Spend: double (nullable = true)
 |-- State: string (nullable = true)
 |-- Profit: double (nullable = true)
 |-- State_numeric: double (nullable = false)
 |-- State_onehot: vector (nullable = true)



In [0]:
from pyspark.ml.functions import vector_to_array
df_col_onehot = df_onehot.select('*', vector_to_array('state_onehot').alias('col_onehot'))
df_col_onehot.show()

+---------+--------------+---------------+----------+---------+-------------+-------------+----------+
|R&D Spend|Administration|Marketing Spend|     State|   Profit|State_numeric| State_onehot|col_onehot|
+---------+--------------+---------------+----------+---------+-------------+-------------+----------+
| 165349.2|      136897.8|       471784.1|  New York|192261.83|          1.0|(2,[1],[1.0])|[0.0, 1.0]|
| 162597.7|     151377.59|      443898.53|California|191792.06|          0.0|(2,[0],[1.0])|[1.0, 0.0]|
|153441.51|     101145.55|      407934.54|   Florida|191050.39|          2.0|    (2,[],[])|[0.0, 0.0]|
|144372.41|     118671.85|      383199.62|  New York|182901.99|          1.0|(2,[1],[1.0])|[0.0, 1.0]|
|142107.34|      91391.77|      366168.42|   Florida|166187.94|          2.0|    (2,[],[])|[0.0, 0.0]|
| 131876.9|      99814.71|      362861.36|  New York|156991.12|          1.0|(2,[1],[1.0])|[0.0, 1.0]|
|134615.46|     147198.87|      127716.82|California|156122.51|          

In [0]:
num_categories = len(df_col_onehot.first()['col_onehot']) 
cols_expanded = [(f.col('col_onehot')[i].alias(f'{indexer_fitted.labels[i]}')) for i in range(num_categories)]
df_cols_onehot = df_col_onehot.select('*', *cols_expanded)
df_cols_onehot.show()

+---------+--------------+---------------+----------+---------+-------------+-------------+----------+----------+--------+
|R&D Spend|Administration|Marketing Spend|     State|   Profit|State_numeric| State_onehot|col_onehot|California|New York|
+---------+--------------+---------------+----------+---------+-------------+-------------+----------+----------+--------+
| 165349.2|      136897.8|       471784.1|  New York|192261.83|          1.0|(2,[1],[1.0])|[0.0, 1.0]|       0.0|     1.0|
| 162597.7|     151377.59|      443898.53|California|191792.06|          0.0|(2,[0],[1.0])|[1.0, 0.0]|       1.0|     0.0|
|153441.51|     101145.55|      407934.54|   Florida|191050.39|          2.0|    (2,[],[])|[0.0, 0.0]|       0.0|     0.0|
|144372.41|     118671.85|      383199.62|  New York|182901.99|          1.0|(2,[1],[1.0])|[0.0, 1.0]|       0.0|     1.0|
|142107.34|      91391.77|      366168.42|   Florida|166187.94|          2.0|    (2,[],[])|[0.0, 0.0]|       0.0|     0.0|
| 131876.9|     

In [0]:
df_final = df_cols_onehot.select("R&D Spend", "Administration", "Marketing Spend", "California", "New York", "profit")

In [0]:
assembler = VectorAssembler(inputCols=df_final.columns[:-1],outputCol='features')

data_set = assembler.transform(df_final)
data_set = data_set.select(['features','profit'])

In [0]:
data_set.show()

+--------------------+---------+
|            features|   profit|
+--------------------+---------+
|[165349.2,136897....|192261.83|
|[162597.7,151377....|191792.06|
|[153441.51,101145...|191050.39|
|[144372.41,118671...|182901.99|
|[142107.34,91391....|166187.94|
|[131876.9,99814.7...|156991.12|
|[134615.46,147198...|156122.51|
|[130298.13,145530...| 155752.6|
|[120542.52,148718...|152211.77|
|[123334.88,108679...|149759.96|
|[101913.08,110594...|146121.95|
|[100671.96,91790....| 144259.4|
|[93863.75,127320....|141585.52|
|[91992.39,135495....|134307.35|
|[119943.24,156547...|132602.65|
|[114523.61,122616...|129917.04|
|[78013.11,121597....|126992.93|
|[94657.16,145077....|125370.37|
|[91749.16,114175....| 124266.9|
|[86419.7,153514.1...|122776.86|
+--------------------+---------+
only showing top 20 rows



In [0]:
train_data,test_data = data_set.randomSplit([0.8,0.2])

In [0]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features",labelCol='profit')
lrModel = lr.fit(train_data)

In [0]:
test_stats = lrModel.evaluate(test_data)
print(f"RMSE: {test_stats.rootMeanSquaredError}")
print(f"R2: {test_stats.r2}")
print(f"R2: {test_stats.meanSquaredError}")

RMSE: 7039.210737269694
R2: 0.9487198335918404
R2: 49550487.803692944


In [0]:
test_stats.predictions.show()

+--------------------+---------+------------------+
|            features|   profit|        prediction|
+--------------------+---------+------------------+
|(5,[1,3],[135426....| 42559.73| 45593.74683458533|
|[1000.23,124153.0...| 64926.08| 45222.01805233893|
|[38558.51,82982.0...| 81005.76| 84104.37536252372|
|[61994.48,115641....| 99937.59| 99417.96013956316|
|[66051.52,182645....|103282.38| 101393.2081528368|
|[73994.56,122782....|110352.25|115523.53277285545|
|[76253.86,113867....|118474.03|117632.24678116164|
|[91749.16,114175....| 124266.9|130001.89428330632|
|[94657.16,145077....|125370.37|129505.30437753489|
|[134615.46,147198...|156122.51| 158800.2989726138|
+--------------------+---------+------------------+



In [0]:
# Decision Tree Regression

df = spark.sql("select * from position_salaries")

In [0]:
df.show()

+-----------------+-----+-------+
|         Position|Level| Salary|
+-----------------+-----+-------+
| Business Analyst|    1|  45000|
|Junior Consultant|    2|  50000|
|Senior Consultant|    3|  60000|
|          Manager|    4|  80000|
|  Country Manager|    5| 110000|
|   Region Manager|    6| 150000|
|          Partner|    7| 200000|
|   Senior Partner|    8| 300000|
|          C-level|    9| 500000|
|              CEO|   10|1000000|
+-----------------+-----+-------+



In [0]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol='Position', outputCol='Position_numeric')
indexer_fitted = indexer.fit(df)
df_indexed = indexer_fitted.transform(df)

In [0]:
df_indexed.columns

Out[35]: ['Position', 'Level', 'Salary', 'Position_numeric']

In [0]:
assembler = VectorAssembler(inputCols=[ 'Level', 'Position_numeric'],outputCol='features')
data_set = assembler.transform(df_indexed)
data_set = data_set.select(['features','Salary'])

In [0]:
data_set.show()

+----------+-------+
|  features| Salary|
+----------+-------+
| [1.0,0.0]|  45000|
| [2.0,4.0]|  50000|
| [3.0,8.0]|  60000|
| [4.0,5.0]|  80000|
| [5.0,3.0]| 110000|
| [6.0,7.0]| 150000|
| [7.0,6.0]| 200000|
| [8.0,9.0]| 300000|
| [9.0,1.0]| 500000|
|[10.0,2.0]|1000000|
+----------+-------+



In [0]:
from pyspark.ml.regression import DecisionTreeRegressor
dr = DecisionTreeRegressor(featuresCol="features",labelCol='Salary')
drModel = dr.fit(data_set)

In [0]:
# random forest regression
# The inputs are as follows
# X1=the transaction date (for example, 2013.250=2013 March, 2013.500=2013 June, etc.)
# X2=the house age (unit: year)
# X3=the distance to the nearest MRT station (unit: meter)
# X4=the number of convenience stores in the living circle on foot (integer)
# X5=the geographic coordinate, latitude. (unit: degree)
# X6=the geographic coordinate, longitude. (unit: degree)

# The output is as follow
# Y= house price of unit area (10000 New Taiwan Dollar/Ping, where Ping is a local unit, 1 Ping = 3.3 meter squared)


df = spark.sql("select * from real_estate")

In [0]:
df.show()

+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
| No|X1 transaction date|X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|X5 latitude|X6 longitude|Y house price of unit area|
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
|  1|           2012.917|        32.0|                              84.87882|                             10|   24.98298|   121.54024|                      37.9|
|  2|           2012.917|        19.5|                              306.5947|                              9|   24.98034|   121.53951|                      42.2|
|  3|           2013.583|        13.3|                              561.9845|                              5|   24.98746|   121.54391|                      47.3|
|  4|             2013.5|   

In [0]:
assembler = VectorAssembler(inputCols=df.columns[1:-1],outputCol='features')

data_set = assembler.transform(df)
data_set = data_set.select(['features','Y house price of unit area'])

In [0]:
data_set.show()

+--------------------+--------------------------+
|            features|Y house price of unit area|
+--------------------+--------------------------+
|[2012.917,32.0,84...|                      37.9|
|[2012.917,19.5,30...|                      42.2|
|[2013.583,13.3,56...|                      47.3|
|[2013.5,13.3,561....|                      54.8|
|[2012.833,5.0,390...|                      43.1|
|[2012.667,7.1,217...|                      32.1|
|[2012.667,34.5,62...|                      40.3|
|[2013.417,20.3,28...|                      46.7|
|[2013.5,31.7,5512...|                      18.8|
|[2013.417,17.9,17...|                      22.1|
|[2013.083,34.8,40...|                      41.4|
|[2013.333,6.3,90....|                      58.1|
|[2012.917,13.0,49...|                      39.3|
|[2012.667,20.4,24...|                      23.8|
|[2013.5,13.2,1164...|                      34.3|
|[2013.583,35.7,57...|                      50.5|
|[2013.25,0.0,292....|                      70.1|


In [0]:
train_data,test_data = data_set.randomSplit([0.8,0.2])

In [0]:
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(featuresCol="features",labelCol='Y house price of unit area')
rfModel = rf.fit(train_data)

In [0]:
test_stats = rfModel.transform(test_data)

In [0]:
test_stats.show()

+--------------------+--------------------------+------------------+
|            features|Y house price of unit area|        prediction|
+--------------------+--------------------------+------------------+
|[2012.667,5.6,90....|                      50.0| 53.60582805697392|
|[2012.667,7.1,217...|                      32.1| 27.75672864790375|
|[2012.667,12.6,38...|                      42.5| 43.35049452910742|
|[2012.667,15.6,28...|                      46.1|  48.5832323142672|
|[2012.667,20.2,21...|                      22.8|26.886511332635063|
|[2012.667,20.4,24...|                      23.8|27.022982701834003|
|[2012.667,26.9,44...|                      15.5| 17.30897624527933|
|[2012.667,32.7,39...|                      30.5|36.459119386231635|
|[2012.667,33.4,18...|                      42.2| 44.42771937871716|
|[2012.75,12.5,114...|                      34.1|32.416878022491346|
|[2012.75,13.0,492...|                      40.5| 41.80630738710609|
|[2012.75,14.2,180...|            

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    labelCol="Y house price of unit area", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(test_stats)

In [0]:
rmse

Out[18]: 7.484506897505033