In [ ]:
// Scratch cell: binds the literal 1 to `a`.
val a: Int = 1

# Machine Learning By Example

In [ ]:
:sh head -1 /opt/SparkDatasets/topics/abstracts.csv

In [ ]:
/**
 * One record of the abstracts dataset.
 *
 * @param track label of the abstract (the column later indexed as the class label)
 * @param title title of the abstract
 * @param text  body text of the abstract
 */
case class LabeledAbstract(
  track: String,
  title: String,
  text: String
)

In [ ]:
import org.apache.spark.sql.types.StructType

// Derive the Spark SQL schema from the LabeledAbstract case class via its product encoder.
val labeledAbstractSchema: StructType = {
  val abstractEncoder = Encoders.product[LabeledAbstract]
  abstractEncoder.schema
}

In [ ]:
// Load the ';'-delimited abstracts file using the explicit schema and expose
// every row as a LabeledAbstract.
val inputData: Dataset[LabeledAbstract] = {
  val abstractsPath = "/opt/SparkDatasets/topics/abstracts.csv"
  val rawAbstracts = sparkSession.read
    .schema(labeledAbstractSchema)
    .option("delimiter", ";")
    .csv(abstractsPath)
  rawAbstracts.as[LabeledAbstract]
}

// Mark the dataset for caching; later cells read it repeatedly.
inputData.cache()

In [ ]:
import org.apache.spark.ml.feature.{StringIndexer, StringIndexerModel}

// Fit a label indexer on the whole dataset: "track" strings -> "indexedLabel".
val stringIndexerModel: StringIndexerModel = {
  val trackIndexer = new StringIndexer()
    .setInputCol("track")
    .setOutputCol("indexedLabel")
  trackIndexer.fit(inputData)
}

In [ ]:
// Display the labels array of the fitted StringIndexerModel.
stringIndexerModel.labels

In [ ]:
// Display each label paired with its position in the labels array.
stringIndexerModel.labels.zipWithIndex

In [ ]:
// Apply the fitted indexer and keep only the indexed label plus the two text columns.
val afterStringIndexer: DataFrame = {
  val indexed = stringIndexerModel.transform(inputData)
  indexed.select("indexedLabel", "title", "text")
}

afterStringIndexer.cache()

In [ ]:
import org.apache.spark.ml.feature.IndexToString

// Reverse mapping: numeric "prediction" -> "predictionLabel", reusing the labels
// array of the fitted StringIndexerModel.
val indexToString: IndexToString = {
  val trackLabels = stringIndexerModel.labels
  new IndexToString()
    .setInputCol("prediction")
    .setOutputCol("predictionLabel")
    .setLabels(trackLabels)
}

In [ ]:
import org.apache.spark.ml.feature.SQLTransformer

// SQL stage: keeps indexedLabel and concatenates title and text (separated by a
// single space) into a new titleAndText column.
val sqlTransformer: SQLTransformer =
  (new SQLTransformer).setStatement("""SELECT indexedLabel,
                                              concat(title, ' ' , text) AS titleAndText
                                       FROM __THIS__""")

In [ ]:
// Run the SQL stage on the indexed data.
val afterSqlTransformer: DataFrame = sqlTransformer.transform(afterStringIndexer)

afterSqlTransformer.cache()

In [ ]:
import org.apache.spark.ml.feature.RegexTokenizer

// Tokenizer: splits "titleAndText" on the pattern \W+ and writes "lowercaseTokens".
val regexTokenizer: RegexTokenizer = {
  val tokenizer = new RegexTokenizer()
  tokenizer.setInputCol("titleAndText")
  tokenizer.setOutputCol("lowercaseTokens")
  tokenizer.setPattern("\\W+")
}

In [ ]:
// Tokenize and keep only the label and the token column.
val afterRegexTokenizer = {
  val tokenized = regexTokenizer.transform(afterSqlTransformer)
  tokenized.select("indexedLabel", "lowercaseTokens")
}

afterRegexTokenizer.cache()

In [ ]:
import org.apache.spark.ml.feature.StopWordsRemover

// Stop-word filter: reads the tokenizer's output column and writes "filteredWords".
val stopWordsRemover = {
  val tokenColumn = regexTokenizer.getOutputCol
  new StopWordsRemover()
    .setInputCol(tokenColumn)
    .setOutputCol("filteredWords")
}

In [ ]:
// Remove stop words and keep only the label and the filtered tokens.
val afterStopWordsRemover = {
  val withoutStopWords = stopWordsRemover.transform(afterRegexTokenizer)
  withoutStopWords.select("indexedLabel", "filteredWords")
}

afterStopWordsRemover.cache()

In [ ]:
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.param.ParamMap

import org.apache.spark.sql.types.{StructType, StructField, ArrayType, StringType}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf

/**
 * Pipeline stage that stems every token of the `filteredWords` column with
 * OpenNLP's English Snowball stemmer.
 *
 * The output contains exactly two columns: the incoming `indexedLabel` and a new
 * `stemmedWords` column (array of strings). All other input columns are dropped.
 *
 * @param uid unique identifier of this stage
 */
class CustomStemmer(override val uid: String) extends Transformer {
  /** Creates a stemmer whose uid is randomly generated with the prefix "CustomStemmer". */
  def this() = this(Identifiable.randomUID("CustomStemmer"))

  /** Copies this stage together with the extra params. */
  override def copy(extra: ParamMap): CustomStemmer = defaultCopy(extra)

  /**
   * Returns the schema that `transform` actually produces: the incoming
   * `indexedLabel` field followed by `stemmedWords`.
   *
   * @throws IllegalArgumentException if `indexedLabel` or `filteredWords` is missing
   */
  override def transformSchema(schema: StructType): StructType = {
    require(schema.fieldNames.contains("filteredWords"),
            "CustomStemmer requires an input column named filteredWords")
    // schema("indexedLabel") itself throws IllegalArgumentException when the field is absent.
    StructType(Seq(schema("indexedLabel"),
                   StructField("stemmedWords", ArrayType(StringType))))
  }

  /**
   * Stems each word of `filteredWords`.
   *
   * @param dset dataset holding at least `indexedLabel` and `filteredWords`
   * @return a DataFrame with the columns `indexedLabel` and `stemmedWords`
   */
  override def transform(dset: Dataset[_]): DataFrame = {
    import opennlp.tools.stemmer.snowball.SnowballStemmer

    val stem = udf {
      filteredWords: Seq[String] => {
        // The stemmer is created inside the function, so no stemmer instance is
        // captured by the UDF closure.
        val opennlpStemmer = new SnowballStemmer(SnowballStemmer.ALGORITHM.ENGLISH)
        filteredWords.map(opennlpStemmer.stem(_).toString) }
    }
    dset.select( dset.col("indexedLabel"), stem(dset.col("filteredWords")).as("stemmedWords") )
  }
}

In [ ]:
// Stemming stage instance (uid generated by the no-arg constructor).
val customStemmer: CustomStemmer = new CustomStemmer()

In [ ]:
// Stem the filtered tokens and keep the label plus the stems.
val afterCustomStemmer = {
  val stemmed = customStemmer.transform(afterStopWordsRemover)
  stemmed.select("indexedLabel", "stemmedWords")
}

afterCustomStemmer.cache()

In [ ]:
import org.apache.spark.ml.feature.NGram

// N-gram stage with n = 2: "stemmedWords" -> "stemmedWordBigrams".
val bigramBuilder = new NGram()
  .setN(2)
  .setInputCol("stemmedWords")
  .setOutputCol("stemmedWordBigrams")

In [ ]:
// Add the bigram column, print the resulting schema, then mark the result for caching.
val afterBigramBuilder = bigramBuilder.transform(afterCustomStemmer)

afterBigramBuilder.printSchema()

afterBigramBuilder.cache()

In [ ]:
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.param.ParamMap

import org.apache.spark.sql.types.{StructType, StructField, ArrayType, StringType}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf

/**
 * Pipeline stage that appends the `stemmedWordBigrams` array to the
 * `stemmedWords` array, producing a single `stemsAndBigrams` column.
 *
 * The output contains exactly two columns: the incoming `indexedLabel` and the
 * new `stemsAndBigrams` column. All other input columns are dropped.
 *
 * @param uid unique identifier of this stage
 */
class StemsAndBigramsConcatenator(override val uid: String) extends Transformer {
  /** Creates a stage whose uid is randomly generated with the prefix "StemsAndBigramsConcatenator". */
  def this() = this(Identifiable.randomUID("StemsAndBigramsConcatenator"))

  /** Copies this stage together with the extra params. */
  override def copy(extra: ParamMap): StemsAndBigramsConcatenator = defaultCopy(extra)

  /**
   * Returns the schema that `transform` actually produces: the incoming
   * `indexedLabel` field followed by `stemsAndBigrams`.
   *
   * @throws IllegalArgumentException if `indexedLabel`, `stemmedWords` or
   *                                  `stemmedWordBigrams` is missing
   */
  override def transformSchema(schema: StructType): StructType = {
    Seq("stemmedWords", "stemmedWordBigrams").foreach { inputColumn =>
      require(schema.fieldNames.contains(inputColumn),
              s"StemsAndBigramsConcatenator requires an input column named $inputColumn")
    }
    // schema("indexedLabel") itself throws IllegalArgumentException when the field is absent.
    StructType(Seq(schema("indexedLabel"),
                   StructField("stemsAndBigrams", ArrayType(StringType))))
  }

  /**
   * Concatenates stems and bigrams row by row (stems first, then bigrams).
   *
   * @param dset dataset holding at least `indexedLabel`, `stemmedWords` and `stemmedWordBigrams`
   * @return a DataFrame with the columns `indexedLabel` and `stemsAndBigrams`
   */
  override def transform(dset: Dataset[_]): DataFrame = {
    val concatenateArrayColumns = udf { (a1: Seq[String], a2: Seq[String]) => a1 ++ a2 }
    dset.select( dset.col("indexedLabel"),
                 concatenateArrayColumns(dset.col("stemmedWords"), dset.col("stemmedWordBigrams")).as("stemsAndBigrams") )
  }
}

In [ ]:
// Concatenation stage instance (uid generated by the no-arg constructor).
val stemsAndBigramsConcatenator: StemsAndBigramsConcatenator =
  new StemsAndBigramsConcatenator()

In [ ]:
// Merge stems and bigrams into one token column and keep it with the label.
val afterStemsAndBigramsConcatenator = {
  val merged = stemsAndBigramsConcatenator.transform(afterBigramBuilder)
  merged.select("indexedLabel", "stemsAndBigrams")
}

afterStemsAndBigramsConcatenator.cache()

In [ ]:
import org.apache.spark.ml.feature.HashingTF

// Term-frequency hashing: "stemsAndBigrams" -> "rawFeatures" with 32768 (2^15) features.
val hashingTF = new HashingTF()
  .setInputCol("stemsAndBigrams")
  .setOutputCol("rawFeatures")
  .setNumFeatures(32768)

In [ ]:
// Hash the tokens and keep the label plus the raw feature vectors.
val afterHashingTF = {
  val hashed = hashingTF.transform(afterStemsAndBigramsConcatenator)
  hashed.select("indexedLabel", "rawFeatures")
}

afterHashingTF.cache()

In [ ]:
import org.apache.spark.ml.feature.IDF

// Fit IDF weights on the hashed term frequencies: "rawFeatures" -> "features".
val idfModel = {
  val idfEstimator = new IDF()
    .setInputCol("rawFeatures")
    .setOutputCol("features")
  idfEstimator.fit(afterHashingTF)
}

In [ ]:
// Apply the fitted IDF model and keep the label plus the final feature vectors.
val afterIDF = {
  val weighted = idfModel.transform(afterHashingTF)
  weighted.select("indexedLabel", "features")
}

afterIDF.cache()

In [ ]:
import org.apache.spark.ml.classification.NaiveBayes

// Multinomial Naive Bayes estimator (smoothing 1.0) over "features", predicting "indexedLabel".
val naiveBayes = new NaiveBayes()
  .setLabelCol("indexedLabel")
  .setFeaturesCol("features")
  .setSmoothing(1.0)
  .setModelType("multinomial")

In [ ]:
// import org.apache.spark.ml.classification.RandomForestClassifier

// val randomForest =
//   (new RandomForestClassifier).setLabelCol("indexedLabel")
//                               .setFeaturesCol("features")
//                               .setNumTrees(157)
//                               .setMaxDepth(29)
//                               .setSeed(1234L)

In [ ]:
import org.apache.spark.ml.Pipeline

import org.apache.spark.ml.feature.IDF

// End-to-end pipeline: label indexing, text preparation, feature extraction,
// classification, and mapping the numeric prediction back to a label string.
//
// The IDF stage is an UNFITTED estimator here (same columns as `idfModel`), so
// `pipeline.fit(...)` learns the document frequencies from the data it is given.
// Embedding `idfModel`, which was fitted on the full `inputData`, would leak
// statistics of the held-out split into training and evaluation.
//
// `stringIndexerModel` stays pre-fitted on purpose: `indexToString` is configured
// with its labels array, so both stages must share the same label mapping.
val pipeline =
  (new Pipeline).setStages(Array(stringIndexerModel,
                                 sqlTransformer,
                                 regexTokenizer,
                                 stopWordsRemover,
                                 customStemmer,
                                 bigramBuilder,
                                 stemsAndBigramsConcatenator,
                                 hashingTF,
                                 new IDF().setInputCol("rawFeatures").setOutputCol("features"),
                                 naiveBayes,
                                 indexToString))

In [ ]:
// Seeded random split of the input: 60% for training, 40% for testing.
val Array(trainData, testData) = {
  val splitWeights = Array(0.6, 0.4)
  inputData.randomSplit(splitWeights, seed = 1234L)
}

In [ ]:
// Fit every estimator stage of the pipeline on the training split.
val model: org.apache.spark.ml.PipelineModel = pipeline.fit(trainData)

In [ ]:
// Score the training split and print the schema of the prediction output.
val trainPredictions = model.transform(trainData)
trainPredictions.printSchema()

In [ ]:
// Score the held-out split and show the predicted index next to the true label index.
val testPredictions = model.transform(testData)

testPredictions.select("prediction", "indexedLabel")

In [ ]:
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

// Evaluator computing the "accuracy" metric from "prediction" versus "indexedLabel".
val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("indexedLabel")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")

In [ ]:
// Accuracy on the training split and on the held-out split.
val trainAccuracy: Double = evaluator.evaluate(trainPredictions)
val testAccuracy: Double = evaluator.evaluate(testPredictions)

In [ ]:
// Classify a hand-written abstract about microservices on AWS.
val newAbstractTitle = "Microservices Deployment on the AWS Cloud"

val part1 = "You will see how continuous deployment of microservices on the AWS platform"
val part2 = "is performed in a repeatable and reliable fashion"
val part3 = "with an emphasis on reliability, scalability, monitoring, and security"

// Join the three fragments with single spaces.
val newAbstractText = Seq(part1, part2, part3).mkString(" ")

// Single-row DataFrame; 999 only fills the indexedLabel column that the SQL
// stage selects.
val newObservation = {
  val rows = Seq((999, newAbstractTitle, newAbstractText))
  sparkSession.createDataFrame(rows).toDF("indexedLabel", "title", "text")
}

model.transform(newObservation).select("predictionLabel", "stemsAndBigrams")

In [ ]:
// Classify a second hand-written abstract, this one about machine learning with Spark.
val newAbstractTitle = "Machine Learning At Scale With Spark"

val newAbstractText = "This talk is for aspiring data scientists"

// Single-row DataFrame; 999 only fills the indexedLabel column that the SQL
// stage selects.
val newObservation = {
  val rows = Seq((999, newAbstractTitle, newAbstractText))
  sparkSession.createDataFrame(rows).toDF("indexedLabel", "title", "text")
}

model.transform(newObservation).select("predictionLabel", "stemsAndBigrams")