## Loading the libraries

In [None]:
%classpath add mvn com.salesforce.transmogrifai transmogrifai-core_2.11 0.6.1

In [None]:
%classpath add mvn org.apache.spark spark-mllib_2.11 2.3.2

## Import the classes

In [None]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import org.apache.spark.sql.functions.udf

import com.salesforce.op._
import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.classification._
import com.salesforce.op.evaluators.Evaluators

In [None]:
import com.salesforce.op.OpWorkflow
import com.salesforce.op.evaluators.Evaluators
import com.salesforce.op.readers.DataReaders

## Instantiate Spark

In [None]:
// Spark configuration for running the notebook locally on all available cores.
val conf = new SparkConf().setMaster("local[*]").setAppName("SimpleRegression")
// The session is implicit so that later cells can `import spark.implicits._`
// and so APIs that take an implicit SparkSession can find it.
implicit val spark: SparkSession = SparkSession.builder.config(conf).getOrCreate()

## Schema and Feature Creation

In [None]:
// One row of the listings CSV. Column order: room_type, neighbourhood, number_of_reviews, price.
// The field names are the ones the feature extractors in the next cell read.
// NOTE(review): the field types assume every column is always populated and that
// number_of_reviews is integral / price is numeric — confirm against the CSV;
// columns with missing values would need Option[...] fields instead.
final case class SimpleRegression(
  roomType: String,
  neighbourhood: String,
  numberOfReviews: Int,
  price: Double
)

In [None]:
// Predictor features: each one extracts a single field of a SimpleRegression row
// and converts it to the matching TransmogrifAI feature type.
val roomType = FeatureBuilder
  .Text[SimpleRegression]
  .extract(listing => listing.roomType.toText)
  .asPredictor
val neighbourhood = FeatureBuilder
  .Text[SimpleRegression]
  .extract(listing => listing.neighbourhood.toText)
  .asPredictor
val numberOfReviews = FeatureBuilder
  .Integral[SimpleRegression]
  .extract(listing => listing.numberOfReviews.toIntegral)
  .asPredictor

// Response feature: the listing price, extracted as a non-nullable real.
val price = FeatureBuilder
  .RealNN[SimpleRegression]
  .extract(listing => listing.price.toRealNN)
  .asResponse

## Load the data

In [None]:
import spark.implicits._

// Location of the training data, relative to the notebook's working directory.
val trainFilePath = "./data/listing_three_features.csv"
// Typed CSV reader that parses each line into a SimpleRegression row.
// NOTE(review): this uses the reader's default CSV options; if the file starts
// with a header row, pass a CSVOptions with the header enabled — confirm against the data.
val trainDataReader = DataReaders.Simple.csvCase[SimpleRegression](path = Option(trainFilePath))

In [None]:
// Sanity check: evaluates to true when the training file is present at trainFilePath.
new java.io.File(trainFilePath).exists()

In [None]:
import com.salesforce.op.stages.impl.tuning.{DataCutter, DataSplitter}
// Combine the three predictors into a single feature vector using
// TransmogrifAI's automated feature engineering.
val features = Seq(roomType, neighbourhood, numberOfReviews).transmogrify()
// Fixed seed so that data splitting and model selection are reproducible.
val randomSeed = 42L
// Splitter handed to the model selector below to reserve part of the data for testing.
val splitter = DataSplitter(seed = randomSeed)

## Model Selector
The ModelSelector is an Estimator that uses data to find the best model. 

In [None]:
import com.salesforce.op.stages.impl.regression.RegressionModelsToTry.{OpGBTRegressor, OpRandomForestRegressor,OpLinearRegression}
import com.salesforce.op.stages.impl.regression.RegressionModelsToTry.{OpLinearRegression}
import com.salesforce.op.stages.impl.regression.RegressionModelSelector

// NOTE(review): `cutter` is not used by the selector in this cell (it takes
// `splitter`); it is kept in case other cells refer to it.
val cutter = DataCutter(reserveTestFraction = 0.2, seed = randomSeed)

// Cross-validated model selection over the listed regression model types.
// Other candidates imported above can be swapped in, e.g.
// Seq(OpGBTRegressor, OpRandomForestRegressor).
val prediction = RegressionModelSelector
  .withCrossValidation(
    dataSplitter = Some(splitter),
    seed = randomSeed,
    modelTypesToUse = Seq(OpLinearRegression, OpRandomForestRegressor)
  )
  .setInput(price, features)
  .getOutput()

## Evaluators and Workflow
`Evaluators.Regression` is the factory for the evaluator that computes regression metrics. The metrics returned are RMSE, MSE, R2 and MAE:
* Mean Squared Error (MSE)
* Root Mean Squared Error (RMSE)
* Mean Absolute Error (MAE)
* Coefficient of Determination (R2)

OpWorkflows create and transform the raw data needed to compute the Features fed into them. In addition, they optimize the application of the Stages needed to create the final Features, ensuring optimal computation within the full pipeline DAG. An OpWorkflow can be fit to a given dataset using the `.train()` method. This produces an OpWorkflowModel, which can then be saved to disk and applied to another dataset.

In [None]:
// Regression evaluator comparing the true label (price) with the model's prediction.
val evaluator = Evaluators.Regression().setLabelCol(price).setPredictionCol(prediction)

In [None]:
// Workflow that materialises the prediction (and keeps the label for evaluation),
// reading its input rows from the training data reader.
val workflow = new OpWorkflow()
  .setResultFeatures(prediction, price)
  .setReader(trainDataReader)
// Fit every stage of the workflow on the training data.
val workflowModel = workflow.train()

## Score and evaluate

In [None]:
// Score the data with the fitted workflow and compute the evaluator's metrics in one pass.
// The result is a pair: (scored DataFrame, evaluation metrics).
val dfScoreAndEvaluate = workflowModel.scoreAndEvaluate(evaluator)
val dfScore = dfScoreAndEvaluate._1
val dfEvaluate = dfScoreAndEvaluate._2
println(s"Evaluate:\n$dfEvaluate")

// Print the scored rows without truncating column contents.
dfScore.show(false)