In [None]:
%classpath add mvn com.salesforce.transmogrifai transmogrifai-core_2.11 0.5.1

In [None]:
%classpath add mvn org.apache.spark spark-mllib_2.11 2.3.0

In [None]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import org.apache.spark.sql.functions.udf

import com.salesforce.op._
import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.evaluators.Evaluators

In [None]:
import com.salesforce.op.OpWorkflow
import com.salesforce.op.evaluators.Evaluators
import com.salesforce.op.readers.DataReaders

In [None]:
// Spark configuration for this notebook: master "local[*]" and a fixed application name.
val conf: SparkConf =
  new SparkConf()
    .setMaster("local[*]")
    .setAppName("PimaIndiansClassification")

// Declared implicit, presumably so that library calls taking an implicit
// SparkSession (e.g. workflow training) can resolve it -- confirm against the callee signatures.
implicit val spark: SparkSession =
  SparkSession.builder().config(conf).getOrCreate()

In [None]:
/**
 * One record of the Pima Indians data set: eight numeric measurements
 * followed by a class label kept as a string.
 *
 * Field names and their order are part of the contract with the readers and
 * feature extractors in this notebook, so they are left exactly as declared.
 * NOTE(review): `spinThickness` and `diabetesPredigree` look like misspellings
 * of "skinThickness" and "diabetesPedigree" -- confirm before renaming, since
 * other cells reference these names.
 *
 * @param numberOfTimesPreg number of pregnancies (presumably a count stored as Double)
 * @param plasmaGlucose     plasma glucose measurement
 * @param bp                blood pressure measurement
 * @param spinThickness     thickness measurement (see note above)
 * @param serumInsulin      serum insulin measurement
 * @param bmi               body mass index
 * @param diabetesPredigree diabetes pedigree value (see note above)
 * @param ageInYrs          age in years
 * @param piClass           class label, as text
 */
case class PimaIndians(
  numberOfTimesPreg: Double,
  plasmaGlucose: Double,
  bp: Double,
  spinThickness: Double,
  serumInsulin: Double,
  bmi: Double,
  diabetesPredigree: Double,
  ageInYrs: Double,
  piClass: String
)

In [None]:
// Raw feature definitions, one per PimaIndians field.
// The eight numeric fields are declared as Real predictors; piClass is the Text response.
// The val names are kept identical to the field names they read.
val numberOfTimesPreg = FeatureBuilder.Real[PimaIndians].extract(_.numberOfTimesPreg.toReal).asPredictor
val plasmaGlucose = FeatureBuilder.Real[PimaIndians].extract(_.plasmaGlucose.toReal).asPredictor
val bp = FeatureBuilder.Real[PimaIndians].extract(_.bp.toReal).asPredictor
val spinThickness = FeatureBuilder.Real[PimaIndians].extract(_.spinThickness.toReal).asPredictor
val serumInsulin = FeatureBuilder.Real[PimaIndians].extract(_.serumInsulin.toReal).asPredictor
val bmi = FeatureBuilder.Real[PimaIndians].extract(_.bmi.toReal).asPredictor
val diabetesPredigree = FeatureBuilder.Real[PimaIndians].extract(_.diabetesPredigree.toReal).asPredictor
// Reads the ageInYrs field. Extracting diabetesPredigree here would duplicate
// that predictor and drop age from the model entirely.
val ageInYrs = FeatureBuilder.Real[PimaIndians].extract(_.ageInYrs.toReal).asPredictor
// Response feature: the class label is extracted as text.
val piClass = FeatureBuilder.Text[PimaIndians].extract(_.piClass.toText).asResponse

In [None]:
 // Relative location of the training data file.
 // NOTE(review): the file name is spelled "prima..." -- confirm it matches the file on disk.
 val trainFilePath: String =
   "../src/main/resources/PimaIndiansDataset/primaindiansdiabetes.data"

In [None]:
import com.salesforce.op.features.FeatureBuilder
import com.salesforce.op.features.types._

In [None]:
// Session implicits; presumably these supply the Encoder that csvCase needs for PimaIndians.
import spark.implicits._

// CSV reader that yields PimaIndians records from the training file.
// Option(...) is kept so that a null path would become None.
val trainDataReader = DataReaders.Simple.csvCase[PimaIndians](path = Option(trainFilePath))

In [None]:
import com.salesforce.op.stages.impl.tuning.{DataCutter, DataSplitter}

// Combine the eight predictors into a single feature via transmogrify().
val features = Seq(
  numberOfTimesPreg,
  plasmaGlucose,
  bp,
  spinThickness,
  serumInsulin,
  bmi,
  diabetesPredigree,
  ageInYrs
).transmogrify()

// Single seed shared by the splitter here and by the cells that reference randomSeed.
val randomSeed: Long = 42L

// NOTE(review): no visible cell uses this splitter (model selection is given a
// DataCutter instead); it is kept because it is a top-level notebook name.
val splitter = DataSplitter(seed = randomSeed)

In [None]:
import org.apache.spark.sql.Encoders

// Explicit product encoder for PimaIndians, made available implicitly.
implicit val piEncoder: org.apache.spark.sql.Encoder[PimaIndians] =
  Encoders.product[PimaIndians]

// Reader built without a path.
// NOTE(review): no visible cell uses piReader; the workflow is given trainDataReader.
val piReader = DataReaders.Simple.csvCase[PimaIndians]()

// Indexed form of the text response; used as the label for model selection and evaluation.
val labels = piClass.indexed()

In [None]:
import com.salesforce.op.stages.impl.classification.MultiClassificationModelSelector
import com.salesforce.op.stages.impl.tuning.DataCutter

// Data cutter configured with a 0.2 reserved test fraction and the shared seed.
val cutter = DataCutter(reserveTestFraction = 0.2, seed = randomSeed)

// Cross-validated multi-class model selection over (labels, features);
// the selector's output feature is the prediction.
val prediction =
  MultiClassificationModelSelector
    .withCrossValidation(splitter = Option(cutter), seed = randomSeed)
    .setInput(labels, features)
    .getOutput()

In [None]:
// Multi-class F1 evaluator comparing the indexed labels with the model's prediction.
val evaluator =
  Evaluators.MultiClassification
    .f1()
    .setLabelCol(labels)
    .setPredictionCol(prediction)

In [None]:
// Workflow that produces the prediction and label features from the training reader.
val workflow: OpWorkflow =
  new OpWorkflow()
    .setResultFeatures(prediction, labels)
    .setReader(trainDataReader)

In [None]:
// Train the workflow; the result is the fitted model used for scoring below.
val workflowModel: OpWorkflowModel = workflow.train()

In [None]:
// Score and evaluate in one call; the result is a pair whose first element is
// the scored data and whose second element holds the evaluation result.
val dfScoreAndEvaluate = workflowModel.scoreAndEvaluate(evaluator)

// Print the scored rows without truncating column contents.
dfScoreAndEvaluate match {
  case (scored, _) => scored.show(false)
}

In [None]:
// Second element of the score-and-evaluate pair: the evaluation result.
val dfEvaluate = dfScoreAndEvaluate match {
  case (_, metrics) => metrics
}

// Render it as text; as the last expression of the cell, this is what the notebook displays.
dfEvaluate.toString