## Linear regression with pyspark

In [1]:
import pandas as pd

from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell below (reuses an already
# running session if there is one, otherwise starts a new one).
spark = (
    SparkSession.builder
    .appName("Pyspark course")
    .getOrCreate()
)

In [2]:
from sklearn.datasets import load_boston
# Load the Boston housing dataset that ships with scikit-learn.
# NOTE(review): `load_boston` has been removed from newer scikit-learn
# releases (1.2+ — confirm); this cell needs an older version to run.
data = load_boston()

# Feature column names as a plain Python list.
all_cols = list(data.feature_names)

In [3]:
# pandas frame with one column per feature plus the regression target,
# stored under the name `label`.
data_df = (
    pd.DataFrame(data.data, columns=data['feature_names'])
    .assign(label=data.target)
)

In [4]:
# Convert the pandas DataFrame into a Spark DataFrame.
data_sdf = spark.createDataFrame(data_df)

In [5]:
# Preview the Spark DataFrame as a text table to sanity-check the conversion.
data_sdf.show()

+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|label|
+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+
|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98| 24.0|
|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14| 21.6|
|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03| 34.7|
|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94| 33.4|
|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33| 36.2|
|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0|   18.7|394.12| 5.21| 28.7|
|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0|   15.2| 395.6|12.43| 22.9|
|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|5.0|311.0|   15.2| 396.9|19.15| 27.1|
|0.21124|12.5| 7.87| 0.0|0.524|5

In [6]:
from pyspark.ml.feature import VectorAssembler


# Pack the two chosen predictor columns into a single vector column
# named "features".
assembler = VectorAssembler(inputCols=['CRIM', 'LSTAT'], outputCol="features")

# NOTE: this rebinds `data` (previously the scikit-learn dataset object) to
# the Spark DataFrame that carries the extra "features" column.
data = assembler.transform(data_sdf)

In [7]:
# Print the schema; the assembled `features` vector column should appear
# after the original columns.
data.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: double (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: double (nullable = true)
 |-- TAX: double (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [8]:
# Random train/test split with weights 0.9 / 0.1; the fixed seed keeps the
# split repeatable between runs.
train, test = data.randomSplit([0.9, 0.1], seed=12345)

In [9]:
from pyspark.ml.regression import LinearRegression

# Linear regression estimator, limited to at most 100 iterations.
reg = LinearRegression(maxIter=100)

In [10]:
# Fit the estimator on the training split.
# Presumably it picks up the `features` / `label` columns by default — confirm.
model = reg.fit(train)

In [11]:
# Score the held-out rows; the result carries an extra `prediction` column.
test_output = model.transform(test)
# Preview the predictions next to the true `label` values.
test_output.show()

+-------+----+-----+----+------+-----+----+------+---+-----+-------+------+-----+-----+---------------+------------------+
|   CRIM|  ZN|INDUS|CHAS|   NOX|   RM| AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|label|       features|        prediction|
+-------+----+-----+----+------+-----+----+------+---+-----+-------+------+-----+-----+---------------+------------------+
|0.01951|17.5| 1.38| 0.0|0.4161|7.104|59.5|9.2229|3.0|216.0|   18.6|393.24| 8.05| 33.0| [0.01951,8.05]|26.760799675492613|
|0.02763|75.0| 2.95| 0.0| 0.428|6.595|21.8|5.4011|3.0|252.0|   18.3|395.63| 4.32| 30.8| [0.02763,4.32]|30.091008048495993|
|0.09744| 0.0| 5.96| 0.0| 0.499|5.841|61.4|3.3779|5.0|279.0|   19.2|377.56|11.41| 20.0|[0.09744,11.41]|23.756232907479905|
|0.12744| 0.0| 6.91| 0.0| 0.448| 6.77| 2.9|5.7209|3.0|233.0|   17.9|385.41| 4.84| 26.6| [0.12744,4.84]|29.621176661541586|
|0.13262| 0.0| 8.56| 0.0|  0.52|5.851|96.7|2.1069|5.0|384.0|   20.9|394.05|16.47| 19.5|[0.13262,16.47]|19.236030155124013|
| 0.1396| 0.0| 8

## Exercise 

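Calculate the mean squared error (MSE) of the model's predictions on the test set, together with its square root (RMSE).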

In [12]:
import pyspark.sql.functions as F
# Squared difference between prediction and true label, per row.
squared_error = F.pow(F.col("prediction") - F.col("label"), 2)

# Average it over the test rows to get the MSE ...
mse_sdf = test_output.select(F.avg(squared_error).alias('mse'))

# ... and report the MSE together with its square root.
mse_sdf.select("mse", F.sqrt("mse")).collect()

[Row(mse=53.537863568477526, SQRT(mse)=7.316957261627098)]

## Pipelines

In [14]:
# Re-split the raw (not yet assembled) Spark DataFrame with weights 0.8 / 0.2;
# HouseValueModel assembles the feature vector itself.
# NOTE: this rebinds the `train` / `test` names used in the earlier section.
train, test = data_sdf.randomSplit([0.8, 0.2], seed=666)

In [15]:
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler

class HouseValueModel:
    """Feature assembly plus a regression estimator behind one small interface.

    The wrapper packs the chosen input columns into a single ``features``
    vector column, fits the supplied estimator on the result, and can save /
    restore the fitted model.

    Parameters
    ----------
    reg
        Unfitted estimator; ``reg.fit(sdf)`` must return a fitted model that
        offers ``transform(sdf)`` and ``save(path)``.
    reg_model
        Object whose ``load(path)`` returns a fitted model; used by ``load``.
    inputCols
        Names of the columns assembled into ``features``. ``None`` (the
        default) selects ``['CRIM', 'LSTAT']``.
    """

    def __init__(self, reg, reg_model, inputCols=None):
        # A None sentinel avoids sharing one mutable default list between
        # every instance of the class.
        if inputCols is None:
            inputCols = ['CRIM', 'LSTAT']
        self.assembler = VectorAssembler(
            inputCols=inputCols,
            outputCol="features")
        self.regression = reg
        self.reg_model = reg_model
        # Set by fit() or load(); None until one of them has been called.
        self.model = None

    def _fitted_model(self):
        """Return the fitted model, or raise a clear error if there is none."""
        if self.model is None:
            raise RuntimeError(
                "HouseValueModel has no fitted model yet; "
                "call fit() or load() first.")
        return self.model

    def fit(self, sdf):
        """Assemble features from ``sdf``, fit the estimator, and return ``self``."""
        sdf_with_features = self.assembler.transform(sdf)
        self.model = self.regression.fit(sdf_with_features)
        return self

    def transform(self, sdf):
        """Return ``sdf`` with assembled features, scored by the fitted model.

        Raises
        ------
        RuntimeError
            If neither ``fit`` nor ``load`` has been called yet.
        """
        model = self._fitted_model()
        sdf_with_features = self.assembler.transform(sdf)
        return model.transform(sdf_with_features)

    def transform_and_calculate_mse(self, sdf):
        """Score ``sdf`` and pass the predictions to the module-level ``mse``."""
        prediction_sdf = self.transform(sdf)
        return mse(prediction_sdf)

    def save(self, file_name, overwrite=False):
        """Persist the fitted model under ``file_name``.

        With ``overwrite=True`` the model's writer is asked to replace an
        existing path; otherwise the model's plain ``save`` is used, exactly
        as before.

        Raises
        ------
        RuntimeError
            If neither ``fit`` nor ``load`` has been called yet.
        """
        model = self._fitted_model()
        if overwrite:
            model.write().overwrite().save(file_name)
        else:
            model.save(file_name)

    def load(self, file_name):
        """Restore a fitted model from ``file_name`` and return ``self``."""
        self.model = self.reg_model.load(file_name)
        return self

def mse(sdf, prediction_col="prediction", label_col="label"):
    """Compute the mean squared error between two columns of ``sdf``.

    Parameters
    ----------
    sdf
        DataFrame holding both the predicted and the true values.
    prediction_col
        Name of the column with predicted values (default ``"prediction"``).
    label_col
        Name of the column with true values (default ``"label"``).

    Returns
    -------
    The result of ``sdf.select(...)`` with a single aggregated column named
    ``mse``; nothing is collected here.
    """
    squared_error = F.pow(F.col(prediction_col) - F.col(label_col), 2)
    return sdf.select(F.avg(squared_error).alias('mse'))



In [16]:
from pyspark.ml.regression import LinearRegression, LinearRegressionModel
# Wrap a linear regression estimator (plus the model object used later for
# loading) in the HouseValueModel helper.
house_value_model = HouseValueModel(
    reg=LinearRegression(maxIter=100), reg_model=LinearRegressionModel())

# Train on the training split, then report the MSE on the held-out split.
house_value_model.fit(train)
test_mse_sdf = house_value_model.transform_and_calculate_mse(test)
test_mse_sdf.show()

+-----------------+
|              mse|
+-----------------+
|29.39492665040708|
+-----------------+



In [17]:
# Persist the fitted model. A plain save to a path that already exists fails,
# which breaks re-running the notebook; going through the model's writer with
# overwrite() keeps this cell re-runnable.
house_value_model.model.write().overwrite().save("house_linear_model5")

In [19]:
# Build a fresh wrapper, restore the persisted model into it, and re-evaluate
# on the same test split; the MSE should match the value obtained before saving.
loaded_house_value_model = HouseValueModel(
    reg=LinearRegression(maxIter=100), reg_model=LinearRegressionModel())
loaded_house_value_model.load("house_linear_model5")
reloaded_mse_sdf = loaded_house_value_model.transform_and_calculate_mse(test)
reloaded_mse_sdf.show()

+-----------------+
|              mse|
+-----------------+
|29.39492665040708|
+-----------------+

