In [1]:
import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressor, XGBoostRegressionModel}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType}
import org.apache.spark.sql.SparkSession
import ml.dmlc.xgboost4j.scala.spark.rapids.{GpuDataReader, GpuDataset}

In [2]:
// Input locations of the mq2008 train and test CSV files
val trainPath: String = "/data/rank/csv/mq2008.train"
val testPath: String = "/data/rank/csv/mq2008.test"

trainPath = /data/rank/csv/mq2008.train
testPath = /data/rank/csv/mq2008.test


/data/rank/csv/mq2008.test

In [3]:
// XGBoost training parameters (pairwise ranking objective).
// Declared lazily, so the map is only built the first time it is read.
lazy val paramMap: Map[String, Any] = Map(
  ("objective", "rank:pairwise"),
  ("eta", 0.1),
  ("min_child_weight", 0.1),
  ("gamma", 1.0),
  ("max_depth", 6),
  ("num_round", 4),
  ("missing", 0.0)
)

paramMap = <lazy>


<lazy>

In [4]:
// Column names: the label column, the group column, and 46 numbered feature columns
val labelName: String = "label"
val groupName: String = "group"
def featureNames: Seq[String] = for (idx <- 0 until 46) yield s"feature_$idx"

// CSV schema: a float label, an integer group, then one float column per feature name
def schema: StructType = {
  val leadingFields = Seq(
    StructField(labelName, FloatType),
    StructField(groupName, IntegerType)
  )
  val featureFields = featureNames.map(column => StructField(column, FloatType))
  StructType(leadingFields ++ featureFields)
}

labelName = label
groupName = group


featureNames: Seq[String]
schema: org.apache.spark.sql.types.StructType


group

In [5]:
// Create the Spark session, load both data sets, and configure the XGBoostRegressor
val spark = SparkSession.builder().appName("MQ2008-GPU").getOrCreate()
// === diff ===
val reader = new GpuDataReader(spark).schema(schema).option("header", false)
// === diff ===
val trainSet = reader.csv(trainPath)
val testSet = reader.csv(testPath)

// Add the GPU tree method and the worker count on top of the base training parameters
val xgbParamFinal = paramMap + ("tree_method" -> "gpu_hist") + ("num_workers" -> 1)
val xgbRegressor = new XGBoostRegressor(xgbParamFinal)
  .setLabelCol(labelName)
  .setGroupCol(groupName)
  // === diff ===
  .setFeaturesCols(featureNames)

spark = org.apache.spark.sql.SparkSession@39b7b2e7
reader = ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataReader@58903ce0
trainSet = ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataset@4a9d2d21
testSet = ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataset@14470417
xgbParamFinal = Map(min_child_weight -> 0.1, num_workers -> 1, max_depth -> 6, objective -> rank:pairwise, num_round -> 4, missing -> 0.0, tree_method -> gpu_hist, eta -> 0.1, gamma -> 1.0)
xgbRegressor = xgbr_501baa77765b


xgbr_501baa77765b

In [6]:
/** Small timing helper used to report how long each notebook phase takes. */
object Benchmark {
  /**
   * Runs `block` once, prints how long it took, and returns the result with the elapsed time.
   *
   * @param phase label printed inside the brackets of the timing message
   * @param block by-name expression to measure; it is evaluated exactly once
   * @tparam R result type of `block`
   * @return the value produced by `block` paired with the elapsed time in seconds
   */
  def time[R](phase: String)(block: => R): (R, Float) = {
    // nanoTime is a monotonic clock, so the measured duration cannot go negative or jump
    // when the system wall clock is adjusted (which currentTimeMillis is subject to).
    val startNanos = System.nanoTime
    val result = block // call-by-name
    // Truncate to whole milliseconds so the reported value keeps millisecond granularity.
    val elapsedMillis = (System.nanoTime - startNanos) / 1000000L
    val seconds = elapsedMillis.toFloat / 1000
    println(s"Elapsed time [$phase]: ${seconds}s")
    (result, seconds)
  }
}

// Fit the ranking model on the training set; only the fitted model is kept here,
// the elapsed time is printed by Benchmark.time
val model = Benchmark.time("train")(xgbRegressor.fit(trainSet))._1

Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.19.183.78, DMLC_TRACKER_PORT=9092, DMLC_NUM_WORKER=1}
Elapsed time [train]: 5.889s


defined object Benchmark
model = xgbr_501baa77765b


xgbr_501baa77765b

In [7]:
// Score the test set with the trained model, preview a few rows, then evaluate
val prediction = Benchmark.time("transform") {
  val scored = model.transform(testSet).cache()
  // No-op pass over every partition; presumably this forces the cached result to be
  // computed inside the timed block rather than on first use
  scored.foreachPartition(_ => ())
  scored
}._1
prediction
  .select("label", "feature_0", "feature_1", "feature_2", "prediction")
  .show(10)

// Evaluate the predictions against the label column and report the metric
val evaluator = new RegressionEvaluator().setLabelCol(labelName)
val rmse = Benchmark.time("evaluation")(evaluator.evaluate(prediction))._1
println(s"RMSE == $rmse")

Elapsed time [transform]: 2.459s
+-----+-----------+---------+----------+-------------------+
|label|  feature_0|feature_1| feature_2|         prediction|
+-----+-----------+---------+----------+-------------------+
|  0.0|0.052892998|      1.0|      0.75| 0.5653607249259949|
|  0.0|   0.004959|      0.0|      0.25| 0.1805700957775116|
|  0.0|   0.066116|     0.75|      0.25| 0.7334920167922974|
|  1.0|0.026445998|     0.75|      0.75| 0.4991956353187561|
|  0.0|0.029752001|      0.0|       1.0| 0.5998898148536682|
|  0.0|   0.066116|      0.0|      0.25| 0.6417648792266846|
|  0.0|        0.0|      0.0|      0.25| 0.1651393175125122|
|  0.0|        1.0|      0.0|       0.0| 0.3046388328075409|
|  0.0|   0.003865| 0.142857|0.33333302|0.23630934953689575|
|  0.0|   0.008835| 0.142857|0.33333302| 0.2636098563671112|
+-----+-----------+---------+----------+-------------------+
only showing top 10 rows

Elapsed time [evaluation]: 0.131s
RMSE == 0.5501281048266939


prediction = [label: float, group: float ... 47 more fields]
evaluator = regEval_2284e7141b4a
rmse = 0.5501281048266939


0.5501281048266939

In [8]:
// Save the trained model (replacing any previous copy), load it back,
// and score the test set again with the reloaded model
model.write.overwrite().save("/data/model/rank/gpu")

val modelFromDisk = XGBoostRegressionModel.load("/data/model/rank/gpu")
val prediction2 = Benchmark.time("transform2") {
  val rescored = modelFromDisk.transform(testSet)
  // No-op pass over every partition so the transform runs inside the timed block
  rescored.foreachPartition(_ => ())
  rescored
}._1
prediction2
  .select("label", "feature_0", "feature_1", "feature_2", "prediction")
  .show(10)

Elapsed time [transform2]: 0.173s
+-----+-----------+---------+----------+-------------------+
|label|  feature_0|feature_1| feature_2|         prediction|
+-----+-----------+---------+----------+-------------------+
|  0.0|0.052892998|      1.0|      0.75| 0.5653607249259949|
|  0.0|   0.004959|      0.0|      0.25| 0.1805700957775116|
|  0.0|   0.066116|     0.75|      0.25| 0.7334920167922974|
|  1.0|0.026445998|     0.75|      0.75| 0.4991956353187561|
|  0.0|0.029752001|      0.0|       1.0| 0.5998898148536682|
|  0.0|   0.066116|      0.0|      0.25| 0.6417648792266846|
|  0.0|        0.0|      0.0|      0.25| 0.1651393175125122|
|  0.0|        1.0|      0.0|       0.0| 0.3046388328075409|
|  0.0|   0.003865| 0.142857|0.33333302|0.23630934953689575|
|  0.0|   0.008835| 0.142857|0.33333302| 0.2636098563671112|
+-----+-----------+---------+----------+-------------------+
only showing top 10 rows



modelFromDisk = xgbr_501baa77765b
prediction2 = [label: float, group: float ... 47 more fields]


[label: float, group: float ... 47 more fields]

In [9]:
// Shut down the Spark session now that the notebook is finished
spark.stop()