BigQueryDataFrame.scala
package com.samelamin.spark.bigquery
import com.google.api.services.bigquery.model.{TableReference, TableSchema}
import com.google.cloud.hadoop.io.bigquery._
import com.google.gson._
import com.samelamin.spark.bigquery.converters.{BigQueryAdapter, SchemaConverters}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{LongWritable, NullWritable}
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
import org.apache.spark.sql.DataFrame
import org.slf4j.LoggerFactory
import scala.util.Random
/**
* Created by samelamin on 12/08/2017.
*/
class BigQueryDataFrame(self: DataFrame) extends Serializable {
  val adaptedDf = BigQueryAdapter(self)
  private val logger = LoggerFactory.getLogger(classOf[BigQueryClient])

  @transient
  lazy val hadoopConf = self.sparkSession.sparkContext.hadoopConfiguration
  lazy val bq = BigQueryClient.getInstance(self.sqlContext)

  @transient
  lazy val jsonParser = new JsonParser()
  /**
   * Save DataFrame data into a BigQuery table using the Hadoop writer API.
   *
   * @param fullyQualifiedOutputTableId output-table id of the form
   *                                    [optional projectId]:[datasetId].[tableId]
   * @param isPartitionedByDay partition the table by day
   * @param timePartitionExpiration expiration applied to the time partitions, passed through to the BigQuery load
   * @param writeDisposition BigQuery write disposition for the load job
   * @param createDisposition BigQuery create disposition for the load job
   */
  def saveAsBigQueryTable(fullyQualifiedOutputTableId: String,
                          isPartitionedByDay: Boolean = false,
                          timePartitionExpiration: Long = 0,
                          writeDisposition: WriteDisposition.Value = null,
                          createDisposition: CreateDisposition.Value = null): Unit = {
    val destinationTable = BigQueryStrings.parseTableReference(fullyQualifiedOutputTableId)
    val bigQuerySchema = SchemaConverters.SqlToBQSchema(adaptedDf)
    val gcsPath = writeDFToGoogleStorage(adaptedDf, destinationTable, bigQuerySchema)
    bq.load(destinationTable,
      bigQuerySchema,
      gcsPath,
      isPartitionedByDay,
      timePartitionExpiration,
      writeDisposition,
      createDisposition)
    delete(new Path(gcsPath))
  }
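
  /**
   * Stage the adapted DataFrame on GCS as newline-delimited JSON so BigQuery can
   * load it, and return the temporary GCS path that was written.
   *
   * @param adaptedDf DataFrame already passed through BigQueryAdapter
   * @param destinationTable reference to the target BigQuery table
   * @param bqSchema BigQuery schema derived from the DataFrame
   * @return the GCS path the JSON data was written to
   */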
  def writeDFToGoogleStorage(adaptedDf: DataFrame,
                             destinationTable: TableReference,
                             bqSchema: TableSchema): String = {
    val tableName = BigQueryStrings.toString(destinationTable)

    BigQueryConfiguration.configureBigQueryOutput(hadoopConf, tableName, bqSchema.toPrettyString())
    hadoopConf.set("mapreduce.job.outputformat.class", classOf[BigQueryOutputFormat[_, _]].getName)
    val bucket = self.sparkSession.conf.get(BigQueryConfiguration.GCS_BUCKET_KEY)
    val temp = s"spark-bigquery-${System.currentTimeMillis()}=${Random.nextInt(Int.MaxValue)}"
    val gcsPath = s"gs://$bucket/hadoop/tmp/spark-bigquery/$temp"
    if (hadoopConf.get(BigQueryConfiguration.TEMP_GCS_PATH_KEY) == null) {
      hadoopConf.set(BigQueryConfiguration.TEMP_GCS_PATH_KEY, gcsPath)
    }
    logger.info(s"Loading $gcsPath into $tableName")
    adaptedDf
      .toJSON
      .rdd
      .map(json => (null, jsonParser.parse(json)))
      .saveAsNewAPIHadoopFile(gcsPath,
        classOf[GsonBigQueryInputFormat],
        classOf[LongWritable],
        classOf[TextOutputFormat[NullWritable, JsonObject]],
        hadoopConf)
    gcsPath
  }
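
  /** Recursively delete the temporary staging path used for the BigQuery load. */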
  private def delete(path: Path): Unit = {
    val fs = FileSystem.get(path.toUri, hadoopConf)
    fs.delete(path, true)
  }
}
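
// A minimal usage sketch, not part of the original file: the SparkSession, bucket
// name and table id below are illustrative assumptions. The staging bucket is read
// from the Spark session conf by writeDFToGoogleStorage above, so it must be set
// before saving.
object BigQueryDataFrameExample {
  import org.apache.spark.sql.SparkSession

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("bigquery-dataframe-example").getOrCreate()

    // Bucket used for the intermediate JSON files written to GCS.
    spark.conf.set(BigQueryConfiguration.GCS_BUCKET_KEY, "my-staging-bucket")

    val df = spark.read.json("gs://my-staging-bucket/input/*.json")

    // Wrap the DataFrame and load it into a day-partitioned BigQuery table.
    new BigQueryDataFrame(df).saveAsBigQueryTable(
      "my-project:my_dataset.my_table",
      isPartitionedByDay = true)
  }
}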