In [ ]:
// Scratch value; handy for checking that the interpreter is responding.
val a: Int = 1

In [ ]:
// Point the executors' extra classpath at the OpenNLP tools jar used by the stemming cells.
// NOTE(review): this is set on an already-created session — confirm the executors actually
// pick the jar up, or move the setting to session/cluster launch configuration.
sparkSession.conf.set(
  "spark.executor.extraClassPath",
  "/jars/opennlp-tools.jar"
)

In [ ]:
:sh head -1 /opt/SparkDatasets/topics/abstracts.csv

In [ ]:
/**
 * One record of the abstracts data set.
 *
 * @param track label column; it is the input of the StringIndexer fitted further down
 * @param title title of the abstract
 * @param text  body text of the abstract
 */
case class LabeledAbstract(
  track: String,
  title: String,
  text: String
)

In [ ]:
import org.apache.spark.sql.types.StructType

// Schema derived from the LabeledAbstract case class, so the CSV reader needs no inference.
val labeledAbstractSchema: StructType = {
  val abstractEncoder = Encoders.product[LabeledAbstract]
  abstractEncoder.schema
}

In [ ]:
// Load the semicolon-delimited abstracts file with the explicit schema and view it as
// a typed Dataset of LabeledAbstract.
val data: Dataset[LabeledAbstract] = {
  val csvReader = sparkSession.read
    .schema(labeledAbstractSchema)
    .option("delimiter", ";")

  csvReader
    .csv("/opt/SparkDatasets/topics/abstracts.csv")
    .as[LabeledAbstract]
}

// Mark the data set for caching; it is reused by the fit and transform steps below.
data.cache()

In [ ]:
import org.apache.spark.ml.feature.{StringIndexer, StringIndexerModel}

// Fit an indexer that maps the string column "track" to the numeric column "indexedLabel".
val stringIndexerModel: StringIndexerModel = {
  val trackIndexer = new StringIndexer()
    .setInputCol("track")
    .setOutputCol("indexedLabel")

  trackIndexer.fit(data)
}

In [ ]:
// Display the labels held by the fitted indexer; the same array is handed to
// IndexToString.setLabels further down to translate predictions back to track names.
stringIndexerModel.labels

In [ ]:
// Append the numeric label and keep only the columns needed from here on.
val indexed: DataFrame = {
  val withLabelIndex = stringIndexerModel.transform(data)
  withLabelIndex.select('indexedLabel, 'title, 'text)
}

// Mark the result for caching before it is reused.
indexed.cache()

In [ ]:
import org.apache.spark.ml.feature.IndexToString

// Inverse of the string indexer: turns the numeric "prediction" column back into a label,
// using the labels of the fitted StringIndexerModel.
val indexToString: IndexToString = {
  val trackLabels = stringIndexerModel.labels

  new IndexToString()
    .setInputCol("prediction")
    .setOutputCol("predictionLabel")
    .setLabels(trackLabels)
}

In [ ]:
import org.apache.spark.ml.feature.SQLTransformer

// Merge title and text into a single "titleAndText" column, keeping the numeric label.
// concat_ws is used instead of concat: concat yields NULL as soon as one argument is NULL,
// which would discard the whole text of any record with a missing title or body, whereas
// concat_ws simply skips NULL arguments. For rows where both fields are present the result
// is the same "<title> <text>" string as before.
val sqlTransformer: SQLTransformer =
  new SQLTransformer().setStatement(
    """SELECT indexedLabel,
      |       concat_ws(' ', title, text) AS titleAndText
      |FROM __THIS__""".stripMargin)

In [ ]:
// Apply the SQL statement to the indexed data, producing indexedLabel and titleAndText.
val transformed: DataFrame = sqlTransformer.transform(indexed)

// Mark the result for caching before it is tokenized.
transformed.cache()

In [ ]:
import org.apache.spark.ml.feature.RegexTokenizer

// Tokenizer that reads "titleAndText" and writes the token array to "words", using the
// pattern \W+ (runs of non-word characters).
val regexTokenizer: RegexTokenizer = {
  val tokenizer = new RegexTokenizer()
  tokenizer.setInputCol("titleAndText")
  tokenizer.setOutputCol("words")
  tokenizer.setPattern("\\W+")
}

In [ ]:
// Tokenize the merged text and keep only the label and the token array.
val words: DataFrame = {
  val tokenized = regexTokenizer.transform(transformed)
  tokenized.select('indexedLabel, 'words)
}

// Mark the result for caching; the stemming experiments below all start from it.
words.cache()

In [ ]:
import opennlp.tools.stemmer.snowball.SnowballStemmer
import opennlp.tools.stemmer.snowball.SnowballStemmer.ALGORITHM

In [ ]:
// English Snowball stemmer from OpenNLP with Serializable mixed in — presumably so that the
// instance can be captured by the UDF registered below and shipped to the executors.
// NOTE(review): mixing in Serializable does not make the state inherited from
// SnowballStemmer serializable; confirm this instance really survives task serialization.
val opennlpStemmer = new SnowballStemmer(ALGORITHM.ENGLISH) with Serializable

In [ ]:
/**
 * Stems every element of a token sequence.
 *
 * @param opennlpStemmer stemmer applied to each token
 * @param stringArray    tokens to stem
 * @return the stemmed tokens, in the same order as the input
 */
def bulkStemmer(opennlpStemmer: SnowballStemmer)(stringArray: Seq[String]): Seq[String] =
  for (token <- stringArray) yield opennlpStemmer.stem(token).toString

In [ ]:
// Register the stemmer as a SQL function named "bulkStemmer". The stemmer argument is fixed
// here, so the registered function takes only the token sequence.
sparkSession.udf.register( "bulkStemmer", bulkStemmer(opennlpStemmer) _ )

In [ ]:
// Add a "stemmedWords" column by calling the registered "bulkStemmer" SQL function on the
// "words" column. The Scala method bulkStemmer works on a Seq[String], not on a Column, so
// it cannot be applied to 'words directly; the call has to go through the UDF.
words.withColumn("stemmedWords", org.apache.spark.sql.functions.expr("bulkStemmer(words)"))

In [ ]:
// Pipeline-friendly variant of the stemming step: keeps every input column and appends the
// output of the registered "bulkStemmer" function. The result is aliased explicitly so that
// later stages can refer to it as "stemmedWords" instead of an auto-generated column name.
val stemmerTransformer: SQLTransformer =
  new SQLTransformer().setStatement(
    """SELECT *,
      |       bulkStemmer(words) AS stemmedWords
      |FROM __THIS__""".stripMargin)

In [ ]:
import org.apache.spark.sql.Row

In [ ]:
// Smoke test: apply f partition-wise to the rows of words and fetch the first resulting row.
// f is defined in the following cell, so that cell has to be evaluated before this one.
words.rdd.mapPartitions(f).first

In [ ]:
/**
 * Stems the text carried by every row of one partition.
 *
 * Each row is expected to hold a Double label followed by either a single String or a
 * sequence of tokens. The `words` DataFrame this function is applied to carries a token
 * array in its second column, so the sequence case is the one that matters there; the
 * single-String case is kept for rows that hold one word.
 *
 * @param rowIterator rows of one partition
 * @return rows with the same label and the stemmed word, or sequence of stemmed tokens
 */
def f(rowIterator: Iterator[Row]): Iterator[Row] = {
  import opennlp.tools.stemmer.snowball.SnowballStemmer
  import opennlp.tools.stemmer.snowball.SnowballStemmer.ALGORITHM

  // One stemmer per partition, created where it is used, so it never has to be serialized.
  val opennlpStemmer = new SnowballStemmer(ALGORITHM.ENGLISH)
  def stemToken(token: String): String = opennlpStemmer.stem(token).toString

  rowIterator.map {
    case Row(indexedLabel: Double, word: String) =>
      Row(indexedLabel, stemToken(word))
    // Element type is erased at runtime, hence Seq[_] and an explicit toString per token.
    case Row(indexedLabel: Double, tokens: scala.collection.Seq[_]) =>
      Row(indexedLabel, tokens.map(token => stemToken(token.toString)))
  }
}