<a href="https://colab.research.google.com/github/rnomadic/Databricks_ML/blob/main/MLRegression2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## One-hot encoding of the categorical (string-typed) columns
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Every column that train_df reports with dtype "string" is treated as categorical.
categorical_cols = [name for name, dtype in train_df.dtypes if dtype == "string"]

# Derived column names: "<col>Index" for the indexed value, "<col>OHE" for the
# one-hot encoded vector.
index_output_cols = [f"{name}Index" for name in categorical_cols]
ohe_output_cols = [f"{name}OHE" for name in categorical_cols]

# Stage 1: map each string category to a numeric index. handleInvalid="skip"
# asks the indexer to drop rows it cannot index instead of raising.
string_indexer = StringIndexer(
    inputCols=categorical_cols,
    outputCols=index_output_cols,
    handleInvalid="skip",
)

# Stage 2: turn each index column into a one-hot vector column.
ohe_encoder = OneHotEncoder(
    inputCols=index_output_cols,
    outputCols=ohe_output_cols,
)

## Vector Assembler
from pyspark.ml.feature import VectorAssembler

# Numeric features are the "double"-typed columns of train_df, excluding the
# label column "price". The two conditions are plain Python booleans, so they
# are combined with the logical `and` rather than the bitwise `&`.
numeric_cols = [
    field
    for (field, dataType) in train_df.dtypes
    if dataType == "double" and field != "price"
]

# The feature vector is the one-hot encoded categorical columns followed by the
# numeric columns, packed into a single "features" column.
assembler_inputs = ohe_output_cols + numeric_cols
vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

## Linear regression estimator
from pyspark.ml.regression import LinearRegression

# Predict the "price" column from the assembled "features" vector.
lr = LinearRegression(featuresCol="features", labelCol="price")

## Pipeline
from pyspark.ml import Pipeline

# Stage order matters: index the strings, one-hot encode the indices, assemble
# the feature vector, then fit the regression on it.
stages = [
    string_indexer,
    ohe_encoder,
    vec_assembler,
    lr,
]
pipeline = Pipeline(stages=stages)

# Fit every stage on the training data; the result is a fitted PipelineModel.
pipeline_model = pipeline.fit(train_df)

## Persist the fitted pipeline, overwriting anything already stored at working_dir
(
    pipeline_model.write()
    .overwrite()
    .save(working_dir)
)

## Reload the pipeline from the same location
from pyspark.ml import PipelineModel

saved_pipeline_model = PipelineModel.load(working_dir)

## Score the test data with the reloaded model
pred_df = saved_pipeline_model.transform(test_df)

# Show the feature vector, the true price and the predicted price side by side.
display(
    pred_df.select("features", "price", "prediction")
)

## Evaluate the predictions against the true "price" values
from pyspark.ml.evaluation import RegressionEvaluator

regression_evaluator = RegressionEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="rmse",
)

# RMSE first, while the evaluator is still configured for "rmse".
rmse = regression_evaluator.evaluate(pred_df)

# Re-point the same evaluator at R2; it stays set to "r2" afterwards.
regression_evaluator.setMetricName("r2")
r2 = regression_evaluator.evaluate(pred_df)

print(f"RMSE is {rmse}")
print(f"R2 is {r2}")

## Linear Regression II Lab

## RFormula
from pyspark.ml import Pipeline
from pyspark.ml.feature import RFormula
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# "price ~ ." uses every column except "price" as a feature.
# handleInvalid="skip" matches the StringIndexer setting in the hand-built
# pipeline: rows with invalid data (e.g. a category in test_df that was not
# seen during fitting) are dropped instead of making transform() raise, which
# is what the default handleInvalid="error" would do.
r_formula = RFormula(
    formula="price ~ .",
    featuresCol="features",
    labelCol="label",
    handleInvalid="skip",
)

# The regression reads the label straight from the original "price" column.
lr = LinearRegression(labelCol="price", featuresCol="features")
stages = [r_formula, lr]
pipeline = Pipeline(stages=stages)

# Fit on the training data, then score the held-out test data.
pipeline_model = pipeline.fit(train_df)
pred_df = pipeline_model.transform(test_df)
# Score the RFormula pipeline's predictions against the true "price" values.
regression_evaluator = RegressionEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="rmse",
)

# Select each metric explicitly before evaluating.
regression_evaluator.setMetricName("rmse")
rmse = regression_evaluator.evaluate(pred_df)

regression_evaluator.setMetricName("r2")
r2 = regression_evaluator.evaluate(pred_df)

print(f"RMSE is {rmse}")
print(f"R2 is {r2}")