In [1]:
// Report which Spark release this notebook session is bound to.
println("Current spark version is " + spark.version)

Current spark version is 2.4.4


In [2]:
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, LongType, StringType}

// Explicit column layout for the headerless CSV, listed in file order.
// Every field is nullable (the StructField default).
val dataSchema = StructType(Seq(
  StructField("target", IntegerType),
  StructField("id", LongType),
  StructField("raw_timestamp", StringType),
  StructField("query_status", StringType),
  StructField("author", StringType),
  StructField("tweet", StringType)
))

// Location of the headerless training CSV on the notebook host.
val dataPath = "/home/jovyan/data/training.1600000.processed.noemoticon.csv"

// Parse the file with the explicit schema and keep only what the model needs:
// a binary `label` (1 when target is 4, otherwise 0) and the tweet text.
val raw_sentiment = spark.read
  .schema(dataSchema)
  .option("header", false)
  .csv(dataPath)
  .selectExpr(
    "(case when target=4 then 1 else 0 end) as label",
    "tweet"
  )

// Sanity check: number of rows per label.
raw_sentiment.groupBy("label").count().show()

+-----+------+
|label| count|
+-----+------+
|    1|800000|
|    0|800000|
+-----+------+



dataSchema = StructType(StructField(target,IntegerType,true), StructField(id,LongType,true), StructField(raw_timestamp,StringType,true), StructField(query_status,StringType,true), StructField(author,StringType,true), StructField(tweet,StringType,true))
dataPath = /home/jovyan/data/training.1600000.processed.noemoticon.csv
raw_sentiment = [label: int, tweet: string]


[label: int, tweet: string]

In [3]:
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.feature.{HashingTF, Tokenizer, IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row

// Break each tweet into a `words` array of tokens.
val tokenizer = new Tokenizer().setInputCol("tweet").setOutputCol("words")

// Hash the token array into a term-frequency vector with 1000 buckets.
val hashingTF = new HashingTF()
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("features")
  .setNumFeatures(1000)

// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
//
// The ordering is pinned to "alphabetAsc" so that label 0 always maps to index 0.0
// and label 1 to index 1.0. The default "frequencyDesc" ordering ranks labels by
// count, and this dataset holds exactly 800000 rows of each class, so the default
// would leave the index assignment to tie-breaking. Anything that reads a class by
// position (for example element 1 of the probability vector) needs this mapping
// to be stable across runs.
val labelIndexer = new StringIndexer()
  .setInputCol("label")
  .setOutputCol("indexedLabel")
  .setStringOrderType("alphabetAsc")
  .fit(raw_sentiment)

// Decide per feature slot whether it is categorical or continuous and index it.
// maxCategories = 4: a slot with more than 4 distinct values stays continuous.
val featureIndexer = new VectorIndexer()
  .setMaxCategories(4)
  .setInputCol(hashingTF.getOutputCol)
  .setOutputCol("indexedFeatures")

// Random forest of 10 trees, reading the indexed label and indexed feature columns.
val rf = new RandomForestClassifier()
  .setNumTrees(10)
  .setFeaturesCol(featureIndexer.getOutputCol)
  .setLabelCol(labelIndexer.getOutputCol)

// Translate the numeric `prediction` back into the original label value,
// using the label order learned by the fitted indexer.
val labelConverter = new IndexToString()
  .setLabels(labelIndexer.labels)
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")

// Assemble the whole workflow; stages execute in the order listed.
val pipeline = new Pipeline().setStages(Array(
  tokenizer,      // tweet    -> words
  hashingTF,      // words    -> features
  labelIndexer,   // label    -> indexedLabel
  featureIndexer, // features -> indexedFeatures
  rf,             // indexedFeatures -> rawPrediction / probability / prediction
  labelConverter  // prediction -> predictedLabel
))

tokenizer = tok_fb51cbf28df6
hashingTF = hashingTF_04515dafe771
labelIndexer = strIdx_d8c1b4c2d287
featureIndexer = vecIdx_c15cf1d312ff
rf = rfc_42b417cbdceb
labelConverter = idxToStr_f6cf56d11ed3


...


idxToStr_f6cf56d11ed3

In [4]:
// Fit every stage of the pipeline on the full labelled dataset.
val model: PipelineModel = pipeline.fit(raw_sentiment)

model = pipeline_fafabf1551e4


pipeline_fafabf1551e4

In [5]:
// Persist the fitted pipeline, replacing any model already stored at this path.
model
  .write
  .overwrite()
  .save("/home/jovyan/models/spark-ml-model")

In [6]:
// Read the persisted pipeline back from disk.
val sameModel: PipelineModel = PipelineModel.load("/home/jovyan/models/spark-ml-model")

sameModel = pipeline_fafabf1551e4


pipeline_fafabf1551e4

In [7]:
// Score the dataset with the reloaded pipeline and preview the first 20 rows.
// NOTE(review): this scores `raw_sentiment`, the same data passed to `pipeline.fit`,
// so the preview says nothing about performance on unseen tweets.
val predictionsDF = sameModel.transform(raw_sentiment)
predictionsDF.show(20)

+-----+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+----------+--------------+
|label|               tweet|               words|            features|indexedLabel|     indexedFeatures|       rawPrediction|         probability|prediction|predictedLabel|
+-----+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+----------+--------------+
|    0|@switchfoot http:...|[@switchfoot, htt...|(1000,[7,14,21,54...|         0.0|(1000,[7,14,21,54...|[4.43320269021225...|[0.44332026902122...|       1.0|             1|
|    0|is upset that he ...|[is, upset, that,...|(1000,[170,193,22...|         0.0|(1000,[170,193,22...|[5.05946061480078...|[0.50594606148007...|       0.0|             0|
|    0|@Kenichan I dived...|[@kenichan, i, di...|(1000,[10,36,77,1...|         0.0|(1000,[10,36,77,1...|[5.31862574694048...|[0.5318625

predictionsDF = [label: int, tweet: string ... 8 more fields]


[label: int, tweet: string ... 8 more fields]

In [8]:
// Print the column layout of the scored DataFrame as a tree.
// Other ways to inspect it:
//   predictionsDF.schema
//   predictionsDF.describe()
//   predictionsDF.stat
println(predictionsDF.schema.treeString)

root
 |-- label: integer (nullable = false)
 |-- tweet: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- indexedLabel: double (nullable = false)
 |-- indexedFeatures: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)
 |-- predictedLabel: string (nullable = true)



In [12]:
import org.apache.spark.sql.functions._

// UDF that pulls element 1 out of an ML vector as a plain Double.
// Meant for the `probability` column; presumably element 1 is the score of the
// class indexed 1.0 — confirm against the label indexer's ordering.
val getProbability = udf { probabilities: org.apache.spark.ml.linalg.Vector =>
  probabilities(1)
}

getProbability = UserDefinedFunction(<function1>,DoubleType,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))


UserDefinedFunction(<function1>,DoubleType,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))

In [13]:
// Show element 1 of each row's probability vector as a single Double column.
predictionsDF
  .select(getProbability(predictionsDF("probability")).as("clean_probability"))
  .show()

+-------------------+
|  clean_probability|
+-------------------+
| 0.5566797309787749|
|0.49405393851992124|
|0.46813742530595104|
|0.44975655685427124|
|0.48519864266191276|
| 0.5110175201989817|
| 0.4991062801436906|
| 0.5262699569394153|
| 0.5110175201989817|
| 0.5046647510532778|
| 0.5110175201989817|
|0.44975655685427124|
| 0.4907178679359401|
| 0.5698226779793351|
| 0.4747979735735111|
| 0.5193876886265103|
| 0.5046647510532778|
| 0.5110175201989817|
| 0.5338348611230203|
| 0.5436126615851099|
+-------------------+
only showing top 20 rows

