# Create entry points to Spark

In [1]:
!pip3 install pyspark



In [0]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Low-level entry point (RDD API), running Spark with the 'local' master.
# getOrCreate() hands back the already-running context when this cell is
# executed again; calling the SparkContext constructor a second time in the
# same kernel fails because only one context may be active at once.
sc = SparkContext.getOrCreate(SparkConf().setMaster('local'))

# High-level entry point (DataFrame / SQL API).
# NOTE(review): "spark.some.config.option" looks like a placeholder setting —
# confirm whether it is actually needed.
spark = (
    SparkSession.builder
    .appName("Python Spark regression")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

![alt text](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTx_zyWVo2dnOxqNQVEExl0EGIwAQIq8a3VbATIWucm5I6GVgzKOg)

# Linear regression without cross-validation

In [4]:
# Read the housing CSV: the first line is a header row and column types are
# inferred from the values.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("sample_data/california_housing_train.csv")
)

# Preview the first five rows.
data.show(5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -114.31|   34.19|              15.0|     5612.0|        1283.0|    1015.0|     472.0|       1.4936|           66900.0|
|  -114.47|    34.4|              19.0|     7650.0|        1901.0|    1129.0|     463.0|         1.82|           80100.0|
|  -114.56|   33.69|              17.0|      720.0|         174.0|     333.0|     117.0|       1.6509|           85700.0|
|  -114.57|   33.64|              14.0|     1501.0|         337.0|     515.0|     226.0|       3.1917|           73400.0|
|  -114.57|   33.57|              20.0|     1454.0|         326.0|     624.0|     262.0|        1.925|           65500.0|
+---------+--------+----

# Transform data structure

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors  # not needed by this cell; kept in case other cells rely on it

# The last column is the regression target; every column before it is a
# feature. Deriving both from the schema avoids hard-coded column positions.
feature_cols = data.columns[:-1]
label_col = data.columns[-1]

# VectorAssembler packs the feature columns into a single vector column named
# 'features' using the DataFrame API, so no per-row Python lambda is required.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
assembled = assembler.transform(data)

# Keep only the two columns the estimators below expect: 'features' and 'label'.
transformed_data = assembled.select('features', assembled[label_col].alias('label'))


In [14]:
from pyspark.ml.regression import LinearRegression

# Linear regression estimator reading inputs from 'features' and the target
# from 'label'; all other hyperparameters are left at their defaults.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
)

# Fit on the complete dataset, then score those same rows (in-sample predictions).
lr_model = lr.fit(transformed_data)
pred = lr_model.transform(transformed_data)

# Preview the first five predictions next to their labels.
pred.show(5)

+--------------------+-------+-------------------+
|            features|  label|         prediction|
+--------------------+-------+-------------------+
|[-114.31,34.19,15...|66900.0|  7128.084577110596|
|[-114.47,34.4,19....|80100.0|  73675.60147899296|
|[-114.56,33.69,17...|85700.0|-31317.285251948982|
|[-114.57,33.64,14...|73400.0|  40802.46695013251|
|[-114.57,33.57,20...|65500.0|-4058.8060622261837|
+--------------------+-------+-------------------+
only showing top 5 rows



# Model evaluation

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator that compares the 'prediction' column against 'label' using R^2.
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='r2',
)

# R^2 of the fitted model on the rows it was trained on (cell output).
evaluator.evaluate(pred)

0.6413378529502702

# Linear regression with cross-validation

# Training and test datasets

In [0]:
# Randomly split the rows roughly 80% / 20% into training and test sets.
# The seed is fixed at 123 so that reruns are intended to give the same split.
training, test = transformed_data.randomSplit(
    weights=[0.8, 0.2],
    seed=123,
)


# Build cross-validation model

In [0]:
##===== build cross-validation model =====
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# estimator
lr = LinearRegression(featuresCol='features', labelCol='label')

# parameter grid: every combination of the listed regParam and
# elasticNetParam values (3 x 3 = 9 candidates)
param_grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0, 0.5, 1])
    .addGrid(lr.elasticNetParam, [0, 0.5, 1])
    .build()
)

# evaluator: candidates are scored by R^2
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label', metricName='r2')

# cross-validation model
# The explicit seed pins how rows are assigned to the 4 folds, so the selected
# model and the reported metrics do not change from one run to the next.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=4,
    seed=123,
)

  # Fit cross-validation model

In [0]:
# Fit the cross-validator on the training split only; the fitted result is
# kept in cv_model for the prediction and inspection cells that follow.
cv_model = cv.fit(training)

In [0]:

# Score both splits with the fitted cross-validation model.
pred_training_cv, pred_test_cv = (
    cv_model.transform(split) for split in (training, test)
)

# Prediction And Evaluation

In [20]:
# R^2 of the cross-validated model on the training split (cell output).
evaluator.setMetricName('r2').evaluate(pred_training_cv)

0.6431458767463152

In [21]:
# R^2 of the cross-validated model on the held-out test split (cell output).
evaluator.setMetricName('r2').evaluate(pred_test_cv)

0.6337166268003807

# Intercept and coefficients

In [22]:
# Report the intercept and coefficient vector of the model selected by cross-validation.
best_lr = cv_model.bestModel
print('Intercept: ', best_lr.intercept, "\n",
      'coefficients: ', best_lr.coefficients)

Intercept:  -3620541.9806647794 
 coefficients:  [-43106.20772616206,-42803.47173089207,1141.408946492884,-7.1828581615540354,107.40315473871928,-38.63249983670063,51.06386146142264,40381.66864335395]


# Get parameter values from the best model
The hyperparameter values of the best model can be read by calling the getter methods on its underlying Java object.

In [23]:
# Read the selected hyperparameters from the Java object wrapped by the best model.
# NOTE(review): _java_obj is a private attribute; newer PySpark releases may
# expose public getters on the fitted model — confirm for the version in use.
best_java_model = cv_model.bestModel._java_obj
best_reg_param = best_java_model.getRegParam()
best_elastic_net_param = best_java_model.getElasticNetParam()

print('best regParam: ' + str(best_reg_param) + "\n" +
      'best ElasticNetParam:' + str(best_elastic_net_param))

best regParam: 1.0
best ElasticNetParam:1.0
