In [23]:
%classpath add mvn com.salesforce.transmogrifai transmogrifai-core_2.11 0.6.0

In [24]:
%classpath add mvn org.apache.spark spark-mllib_2.11 2.3.0

In [25]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import org.apache.spark.sql.functions.udf

import com.salesforce.op._
import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.evaluators.Evaluators

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import org.apache.spark.sql.functions.udf
import com.salesforce.op._
import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.evaluators.Evaluators


In [26]:
import com.salesforce.op.OpWorkflow
import com.salesforce.op.evaluators.Evaluators
import com.salesforce.op.readers.DataReaders

import com.salesforce.op.OpWorkflow
import com.salesforce.op.evaluators.Evaluators
import com.salesforce.op.readers.DataReaders


In [27]:
// Spark configuration: local master using all cores, app name "HousingPricesPrediction".
val conf = new SparkConf()
  .setMaster("local[*]")
  .setAppName("HousingPricesPrediction")

// Declared implicit so it is available to any call that requests an implicit SparkSession.
implicit val spark: SparkSession = SparkSession
  .builder
  .config(conf)
  .getOrCreate()

org.apache.spark.sql.SparkSession@2c028208

In [28]:
/**
 * Row type for the housing-prices data read by the CSV reader in this notebook.
 *
 * `area` and `yrSold` are declared as `Integer` (i.e. `java.lang.Integer`), so they are
 * reference-typed and may hold `null`; the `Double` fields are primitives.
 */
case class HousingPrices(
  lotFrontage: Double,
  area: Integer,
  lotShape: String,
  yrSold: Integer,
  saleType: String,
  saleCondition: String,
  salePrice: Double
)

defined class HousingPrices


In [29]:
import org.apache.spark.sql.{Encoders}
// Product encoder for HousingPrices; the explicit type keeps implicit resolution predictable.
implicit val srEncoder: org.apache.spark.sql.Encoder[HousingPrices] =
  Encoders.product[HousingPrices]
// Integer codes for the known sale-type labels, numbered from 1 in the order listed.
// Labels not listed here are absent from the map, so `get` returns None for them.
val saleTypeEncoder =
  Seq("COD", "CWD", "Con", "ConLD", "ConLI", "ConLw", "New", "Oth", "WD")
    .zipWithIndex
    .map { case (label, idx) => label -> (idx + 1) }
    .toMap


In [30]:
// Predictor: the row's lotFrontage converted to a Real feature value.
val lotFrontage = FeatureBuilder.Real[HousingPrices]
  .extract(row => row.lotFrontage.toReal)
  .asPredictor

// Predictor: the row's area converted to an Integral feature value.
val area = FeatureBuilder.Integral[HousingPrices]
  .extract(row => row.area.toIntegral)
  .asPredictor

Feature(name = area, uid = Integral_00000000000c, isResponse = false, originStage = FeatureGeneratorStage_00000000000c, parents = [], distributions = [])

In [31]:
// Predictor: binary indicator that is 1 when the row's lotShape is "IR1" and 0 otherwise.
val lotShape = FeatureBuilder.Integral[HousingPrices].extract(x =>
    {
      // `==` is null-safe in Scala, so a null lotShape yields 0 instead of the
      // NullPointerException that `x.lotShape.equals(...)` would raise.
      (if (x.lotShape == "IR1") 1 else 0).toIntegral
    }).asPredictor

Feature(name = lotShape, uid = Integral_00000000000d, isResponse = false, originStage = FeatureGeneratorStage_00000000000d, parents = [], distributions = [])

In [32]:
// Predictor: number of years between the row's yrSold and the fixed reference year 2019.
// yrSold is a java.lang.Integer and may be null; wrapping it in Option turns a missing
// year into an empty Integral rather than a NullPointerException on unboxing.
val yrSold = FeatureBuilder.Integral[HousingPrices].extract(x =>
      Option(x.yrSold).map(sold => 2019 - sold.intValue).toIntegral
    ).asPredictor

Feature(name = yrSold, uid = Integral_00000000000e, isResponse = false, originStage = FeatureGeneratorStage_00000000000e, parents = [], distributions = [])

In [33]:
 // Predictor: integer code looked up in saleTypeEncoder for the row's saleType.
 // The lookup yields an Option, so a label missing from the map gives an empty Integral.
 val saleType = FeatureBuilder.Integral[HousingPrices]
   .extract(row => saleTypeEncoder.get(row.saleType).toIntegral)
   .asPredictor

Feature(name = saleType, uid = Integral_00000000000f, isResponse = false, originStage = FeatureGeneratorStage_00000000000f, parents = [], distributions = [])

In [34]:
// Integer codes for the known sale-condition labels, numbered 1 to 6 in the order listed.
val saleConditionEncoder =
  Seq("Abnorml", "AdjLand", "Alloca", "Family", "Normal", "Partial")
    .zip(1 to 6)
    .toMap

In [35]:
// Predictor: integer code looked up in saleConditionEncoder for the row's saleCondition.
// The lookup yields an Option, so a label missing from the map gives an empty Integral.
val saleCondition = FeatureBuilder.Integral[HousingPrices]
  .extract(row => saleConditionEncoder.get(row.saleCondition).toIntegral)
  .asPredictor

Feature(name = saleCondition, uid = Integral_000000000010, isResponse = false, originStage = FeatureGeneratorStage_000000000010, parents = [], distributions = [])

In [36]:
// Response (label) feature: the row's salePrice converted to a RealNN value.
val salePrice = FeatureBuilder.RealNN[HousingPrices]
  .extract(row => row.salePrice.toRealNN)
  .asResponse

Feature(name = salePrice, uid = RealNN_000000000011, isResponse = true, originStage = FeatureGeneratorStage_000000000011, parents = [], distributions = [])

In [37]:
 // Relative path to the training CSV handed to the data reader.
 val trainFilePath: String =
   "../src/main/resources/HousingPricesDataset/train_lf_la_ls_ys_st_sc.csv"

../src/main/resources/HousingPricesDataset/train_lf_la_ls_ys_st_sc.csv

In [38]:
// CSV reader that produces HousingPrices records from the training file path.
val trainDataReader =
  DataReaders.Simple.csvCase[HousingPrices](path = Some(trainFilePath))

com.salesforce.op.readers.CSVProductReader@6b06d5a9

In [39]:

// Combine all predictor features into a single feature vector via transmogrify().
val features = Seq(
  lotFrontage,
  area,
  lotShape,
  yrSold,
  saleType,
  saleCondition
).transmogrify()


Feature(name = area-lotFrontage-lotShape-saleCondition-saleType-yrSold_3-stagesApplied_OPVector_000000000014, uid = OPVector_000000000014, isResponse = false, originStage = VectorsCombiner_000000000014, parents = [OPVector_000000000012,OPVector_000000000013], distributions = [])

In [45]:
import com.salesforce.op.stages.impl.tuning.{DataCutter, DataSplitter}
// One fixed seed, shared by the data splitter and the model selector.
val randomSeed: Long = 42L

// Data splitter constructed with the fixed seed.
val splitter = DataSplitter(
  seed = randomSeed
)

DataSplitter_000000000015

In [46]:
import com.salesforce.op.stages.impl.regression.RegressionModelSelector
import com.salesforce.op.stages.impl.regression.RegressionModelsToTry.{OpGBTRegressor, OpRandomForestRegressor}

// Cross-validated regression model selection over the two listed model types,
// taking salePrice as the label and the combined feature vector as input.
val prediction1 = RegressionModelSelector
  .withCrossValidation(
    dataSplitter = Some(splitter),
    seed = randomSeed,
    modelTypesToUse = Seq(OpGBTRegressor, OpRandomForestRegressor)
  )
  .setInput(salePrice, features)
  .getOutput()

Feature(name = area-lotFrontage-lotShape-saleCondition-salePrice-saleType-yrSold_4-stagesApplied_Prediction_00000000001e, uid = Prediction_00000000001e, isResponse = true, originStage = ModelSelector_00000000001e, parents = [RealNN_000000000011,OPVector_000000000014], distributions = [])

In [47]:
// Regression evaluator wired to the salePrice label and the prediction1 output.
val evaluator = Evaluators
  .Regression()
  .setLabelCol(salePrice)
  .setPredictionCol(prediction1)

OpRegressionEvaluator_00000000001f

In [48]:
// Workflow whose result features are the prediction and the label, fed by the training reader.
val workflow = new OpWorkflow()
  .setResultFeatures(prediction1, salePrice)
  .setReader(trainDataReader)

// Train the workflow to obtain the fitted model.
val workflowModel = workflow.train()

com.salesforce.op.OpWorkflowModel@36bb935b

In [49]:
// scoreAndEvaluate returns a pair; the first element is displayed here without truncation.
val dfScoreAndEvaluate = workflowModel.scoreAndEvaluate(evaluator)
dfScoreAndEvaluate match {
  case (scored, _) => scored.show(truncate = false)
}

+--------------------+---------+---------------------------------------------------------------------------------------------------------+
|key                 |salePrice|area-lotFrontage-lotShape-saleCondition-salePrice-saleType-yrSold_4-stagesApplied_Prediction_00000000001e|
+--------------------+---------+---------------------------------------------------------------------------------------------------------+
|-4303885519363832332|208500.0 |[prediction -> 162767.32467901072]                                                                       |
|6481802382031499884 |181500.0 |[prediction -> 171742.34944370584]                                                                       |
|-1351952053996689297|223500.0 |[prediction -> 205267.20891450963]                                                                       |
|7082539848750989087 |140000.0 |[prediction -> 154184.0170539833]                                                                        |
|-7199768425040789486|25000

null

In [50]:
// Second element of the score-and-evaluate pair, rendered as a string for the cell output.
val dfEvaluate = dfScoreAndEvaluate._2
dfEvaluate.toString

{
  "RootMeanSquaredError" : 62279.68927853887,
  "MeanSquaredError" : 3.8787596966313496E9,
  "R2" : 0.44174619858640085,
  "MeanAbsoluteError" : 43644.811995952376
}