In [1]:
%classpath add mvn com.salesforce.transmogrifai transmogrifai-core_2.11 0.5.1

In [2]:
%classpath add mvn org.apache.spark spark-mllib_2.11 2.3.0

In [3]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import org.apache.spark.sql.functions.udf

import com.salesforce.op._
import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.evaluators.Evaluators

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import org.apache.spark.sql.functions.udf
import com.salesforce.op._
import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.evaluators.Evaluators


In [4]:
import com.salesforce.op.OpWorkflow
import com.salesforce.op.evaluators.Evaluators
import com.salesforce.op.readers.DataReaders

import com.salesforce.op.OpWorkflow
import com.salesforce.op.evaluators.Evaluators
import com.salesforce.op.readers.DataReaders


In [5]:
// Spark configuration for a local run: master "local[*]", application name
// "HousingPricesPrediction".
val conf: SparkConf = new SparkConf()
  .setMaster("local[*]")
  .setAppName("HousingPricesPrediction")
// Marked implicit so later calls that take an implicit SparkSession can resolve it.
// The type is spelled out because implicit definitions should not rely on inference.
implicit val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()

org.apache.spark.sql.SparkSession@3679a998

In [6]:
/**
 * One row of the housing-prices data set.
 *
 * The field order is part of the positional constructor and is kept as declared.
 *
 * @param lotFrontage   lot frontage value, used as a real-valued predictor
 * @param area          area value, used as an integral predictor (boxed, so it may be null)
 * @param lotShape      raw lot-shape code (the notebook only distinguishes "IR1" from the rest)
 * @param yrSold        year of sale (boxed, so it may be null)
 * @param saleType      raw sale-type code
 * @param saleCondition raw sale-condition code
 * @param salePrice     sale price, used as the response
 */
case class HousingPrices(
    lotFrontage: Double,
    area: Integer,
    lotShape: String,
    yrSold: Integer,
    saleType: String,
    saleCondition: String,
    salePrice: Double
)

defined class HousingPrices


In [8]:
import org.apache.spark.sql.{Encoder, Encoders}
// Implicit Spark encoder for HousingPrices rows, derived from the case class.
// The type is stated explicitly so implicit resolution does not depend on inference.
implicit val srEncoder: Encoder[HousingPrices] = Encoders.product[HousingPrices]
// Lookup table turning each known sale-type code into a small integer code (1..9).
val saleTypeEncoder: Map[String, Int] = Map(
  "COD"   -> 1,
  "CWD"   -> 2,
  "Con"   -> 3,
  "ConLD" -> 4,
  "ConLI" -> 5,
  "ConLw" -> 6,
  "New"   -> 7,
  "Oth"   -> 8,
  "WD"    -> 9
)


In [9]:
// Predictors taken directly from the row: lot frontage as a Real, area as an Integral.
val lotFrontage = FeatureBuilder.Real[HousingPrices]
  .extract(house => house.lotFrontage.toReal)
  .asPredictor
val area = FeatureBuilder.Integral[HousingPrices]
  .extract(house => house.area.toIntegral)
  .asPredictor

Feature(name = area, uid = Integral_000000000002, isResponse = false, originStage = FeatureGeneratorStage_000000000002, parents = [], distributions = [])

In [10]:
// Binary indicator for the lot shape: 1 when the raw value is exactly "IR1", 0 otherwise.
// Scala's `==` is null-safe, so a missing (null) lotShape lands in the 0 branch instead of
// throwing a NullPointerException the way `x.lotShape.equals(...)` would.
val lotShape = FeatureBuilder.Integral[HousingPrices]
  .extract(x => (if (x.lotShape == "IR1") 1 else 0).toIntegral)
  .asPredictor

Feature(name = lotShape, uid = Integral_000000000003, isResponse = false, originStage = FeatureGeneratorStage_000000000003, parents = [], distributions = [])

In [11]:
// Despite its name, this predictor holds 2019 minus the year of sale, not the raw year.
// yrSold is a boxed java.lang.Integer, so it is wrapped in Option first: a null year then
// yields an Integral built from None rather than a NullPointerException on unboxing.
val yrSold = FeatureBuilder.Integral[HousingPrices]
  .extract(x => Option(x.yrSold).map(year => 2019 - year).toIntegral)
  .asPredictor

Feature(name = yrSold, uid = Integral_000000000004, isResponse = false, originStage = FeatureGeneratorStage_000000000004, parents = [], distributions = [])

In [12]:
 // Integer-coded sale type: the raw code is looked up in saleTypeEncoder, and the
 // resulting Option (None for codes missing from the table) is converted with toIntegral.
 val saleType = FeatureBuilder.Integral[HousingPrices]
   .extract(house => saleTypeEncoder.get(house.saleType).toIntegral)
   .asPredictor

Feature(name = saleType, uid = Integral_000000000005, isResponse = false, originStage = FeatureGeneratorStage_000000000005, parents = [], distributions = [])

In [13]:
// Lookup table turning each known sale-condition code into a small integer code (1..6).
val saleConditionEncoder: Map[String, Int] = Map(
  "Abnorml" -> 1,
  "AdjLand" -> 2,
  "Alloca"  -> 3,
  "Family"  -> 4,
  "Normal"  -> 5,
  "Partial" -> 6
)

In [14]:
// Integer-coded sale condition: the raw code is looked up in saleConditionEncoder, and the
// resulting Option (None for codes missing from the table) is converted with toIntegral.
val saleCondition = FeatureBuilder.Integral[HousingPrices]
  .extract(house => saleConditionEncoder.get(house.saleCondition).toIntegral)
  .asPredictor

Feature(name = saleCondition, uid = Integral_000000000006, isResponse = false, originStage = FeatureGeneratorStage_000000000006, parents = [], distributions = [])

In [15]:
// Response feature: the sale price, as a non-nullable real.
val salePrice = FeatureBuilder.RealNN[HousingPrices]
  .extract(house => house.salePrice.toRealNN)
  .asResponse

Feature(name = salePrice, uid = RealNN_000000000007, isResponse = true, originStage = FeatureGeneratorStage_000000000007, parents = [], distributions = [])

In [16]:
 // Relative path of the training CSV handed to the data reader.
 val trainFilePath: String =
   "../src/main/resources/HousingPricesDataset/train_lf_la_ls_ys_st_sc.csv"

../src/main/resources/HousingPricesDataset/train_lf_la_ls_ys_st_sc.csv

In [17]:
// CSV reader typed to the HousingPrices case class, pointed at the training file.
val trainDataReader =
  DataReaders.Simple.csvCase[HousingPrices](path = Option(trainFilePath))

com.salesforce.op.readers.CSVProductReader@488e50ef

In [19]:
import com.salesforce.op.stages.impl.tuning.{DataCutter, DataSplitter}

// Combine the six predictors into a single feature via transmogrify().
val features =
  Seq(lotFrontage, area, lotShape, yrSold, saleType, saleCondition).transmogrify()

// One fixed seed, reused by the splitter here and by the model selector later on.
val randomSeed: Long = 42L
val splitter = DataSplitter(seed = randomSeed)

DataSplitter_00000000000b

In [21]:
import com.salesforce.op.stages.impl.regression.RegressionModelSelector
import com.salesforce.op.stages.impl.regression.RegressionModelsToTry.{OpGBTRegressor, OpRandomForestRegressor}

// Cross-validated model selection between the GBT and random-forest regressors,
// taking salePrice as the label and the combined feature as input.
val prediction1 = RegressionModelSelector
  .withCrossValidation(
    dataSplitter = Some(splitter),
    seed = randomSeed,
    modelTypesToUse = Seq(OpGBTRegressor, OpRandomForestRegressor)
  )
  .setInput(salePrice, features)
  .getOutput()

Feature(name = area-lotFrontage-lotShape-saleCondition-salePrice-saleType-yrSold_4-stagesApplied_Prediction_000000000014, uid = Prediction_000000000014, isResponse = true, originStage = ModelSelector_000000000014, parents = [RealNN_000000000007,OPVector_00000000000a], distributions = [])

In [22]:
// Regression evaluator comparing the salePrice label with the prediction1 output.
val evaluator = Evaluators
  .Regression()
  .setLabelCol(salePrice)
  .setPredictionCol(prediction1)

OpRegressionEvaluator_000000000015

In [23]:
// Wire the result features and the training reader into a workflow, then train it.
val workflow = new OpWorkflow()
  .setResultFeatures(prediction1, salePrice)
  .setReader(trainDataReader)
val workflowModel = workflow.train()

com.salesforce.op.OpWorkflowModel@2830e00b

In [25]:
// scoreAndEvaluate returns a pair; its first element is displayed here without
// truncating column contents, and its second element is read in a later cell.
val dfScoreAndEvaluate = workflowModel.scoreAndEvaluate(evaluator)
dfScoreAndEvaluate._1.show(truncate = false)

+--------------------+---------+---------------------------------------------------------------------------------------------------------+
|key                 |salePrice|area-lotFrontage-lotShape-saleCondition-salePrice-saleType-yrSold_4-stagesApplied_Prediction_000000000014|
+--------------------+---------+---------------------------------------------------------------------------------------------------------+
|182800717145872802  |208500.0 |[prediction -> 162938.1206683031]                                                                        |
|-1260032861651078847|181500.0 |[prediction -> 178249.78987439617]                                                                       |
|-368029859182554265 |223500.0 |[prediction -> 202723.0848026669]                                                                        |
|-8311040069960624803|140000.0 |[prediction -> 152425.85265489892]                                                                       |
|-7894344379705295783|25000

null

In [26]:
// Take the second element of the score/evaluate pair (the evaluation metrics) and
// render it as a string for the cell output.
val (_, dfEvaluate) = dfScoreAndEvaluate
dfEvaluate.toString

{
  "RootMeanSquaredError" : 61714.05980771341,
  "MeanSquaredError" : 3.8086251779500284E9,
  "R2" : 0.45184036907553027,
  "MeanAbsoluteError" : 43901.85329539395
}