In [None]:
import org.apache.spark.sql.types.{StringType, StructType, BooleanType}

// Explicit schema applied when reading the messaging events JSON.
// Nested structs are built first and then assembled in the original field order.
private val SCHEMA = {
  // Struct made only of string fields, added in the given order.
  def stringStruct(fieldNames: String*): StructType =
    fieldNames.foldLeft(new StructType) { (struct, fieldName) => struct.add(fieldName, StringType) }

  val deviceStruct = stringStruct("deviceType")
  val publisherStruct = stringStruct("@id")
  // `isFirstMessage` is the only non-string field, so the tail is spelled out to keep field order.
  val objectStruct =
    stringStruct("@id", "@type", "attachments", "body", "inReplyTo")
      .add("isFirstMessage", BooleanType)
      .add("origin", StringType)
      .add("receiverConversationId", StringType)
      .add("senderConversationId", StringType)
      .add("subject", StringType)
      .add("messageType", StringType)
      .add("publisher", publisherStruct)
  val providerStruct = stringStruct("@type", "component", "productType")
  val trackerStruct = stringStruct("type", "version")
  val accountStruct = stringStruct("accountId")
  val targetStruct = stringStruct("accountId")

  new StructType()
    .add("@type", StringType)
    .add("device", deviceStruct)
    .add("object", objectStruct)
    .add("provider", providerStruct)
    .add("tracker", trackerStruct)
    .add("account", accountStruct)
    .add("target", targetStruct)
    .add("schema", StringType)
    .add("@id", StringType)
    .add("published", StringType)
}

// Single hour partition of the messaging events that both reads below point at.
val messagingEventsPath = "s3://schibsted-spt-common-prod/purple/messaging/client=subito/version=1/year=2017/month=10/day=24/hour=10"

// Same data read twice: once letting Spark infer the schema, once with the explicit SCHEMA.
val events2 = spark.read.json(messagingEventsPath)
val events = spark.read.schema(SCHEMA).json(messagingEventsPath)


events.cache()

// Show what Spark inferred from the raw JSON.
events2.printSchema()

In [None]:
// Print the schema of the explicitly-typed events, then preview 10 rows without truncating values.
events.printSchema()
events.show(numRows = 10, truncate = false)


In [None]:
import org.apache.spark.sql.functions._

// Turns the last segment of a ">"-separated category path into a lowercase, dash-separated slug.
//
// Returns null for a null input (Spark passes null column values straight into Scala UDFs)
// and "" when the path has no segments at all (e.g. ">"), instead of throwing inside the job.
val normalizeCategory = (category: String) => {
  Option(category).map { path =>
    // String.split drops trailing empty segments, so the array can be empty.
    val leaf = path.split(">").lastOption.getOrElse("")
    // Literal replacements; the ",-" step collapses the "," + " " pair left by the first one.
    leaf.toLowerCase.trim
      .replace(" ", "-")
      .replace(",-", "-")
      .replace(",", "-")
      .replace("/", "-")
  }.orNull
}

// Turns the first segment of a ">"-separated category path into a lowercase, dash-separated slug.
//
// Returns null for a null input (Spark passes null column values straight into Scala UDFs)
// and "" when the path has no segments at all (e.g. ">"), instead of throwing inside the job.
val normalizeMacroCategory = (category: String) => {
  Option(category).map { path =>
    // String.split drops trailing empty segments, so the array can be empty.
    val root = path.split(">").headOption.getOrElse("")
    // Literal replacements; the ",-" step collapses the "," + " " pair left by the first one.
    root.toLowerCase.trim
      .replace(" ", "-")
      .replace(",-", "-")
      .replace(",", "-")
      .replace("/", "-")
  }.orNull
}

// Extracts the first segment of a ">"-separated locality path, trimmed (case is preserved).
//
// Returns null for a null input and "" when the path has no segments at all (e.g. ">"),
// instead of throwing inside the Spark job.
val normalizeRegion = (locality: String) => {
  Option(locality)
    .map(_.split(">").headOption.getOrElse("").trim)
    .orNull
}

// Extracts the first segment of a ">"-separated locality path, trimmed.
//
// Returns null for a null locality and "" when the path has no segments at all (e.g. ">"),
// instead of throwing.
//
// NOTE(review): `region` is accepted but never used, so the result is the same as the first
// locality segment alone. The name suggests province and city were meant to be combined;
// confirm the intended output before relying on this function.
val provinceAndCity = (region: String, locality: String) => {
  Option(locality)
    .map(_.split(">").headOption.getOrElse("").trim)
    .orNull
}

// Extracts the text between ":ad" and "list" from a lowercased identifier.
//
// The first (greedy) match of ":ad(.*)list" is taken and every ":ad" / "list" occurrence inside
// it is removed, so surrounding separators are kept, e.g. "...:id:ad:123:list:456" gives ":123:".
//
// Returns null when the input is null or contains no such segment, so that one unexpected
// value yields a SQL NULL instead of failing the whole Spark job.
private val selectAdID = {
  // Compiled once rather than on every call.
  val adSegment = ":ad(.*)list".r

  (value: String) =>
    Option(value)
      .flatMap(raw => adSegment.findFirstIn(raw.toLowerCase()))
      .map(_.replace(":ad", "").replace("list", ""))
      .orNull
}

// Extracts the list id (the part after the ad/list separator) from a lowercased identifier.
//
// Known SDRN prefixes and the "id:ad:" marker (plain or URL-encoded) are stripped, the
// ":list:" separator (plain or URL-encoded) becomes "-", and the second "-"-separated
// segment is returned, trimmed.
//
// Returns null when the input is null or has no second segment, so that one unexpected value
// yields a SQL NULL instead of failing the whole Spark job.
private val selectListID = (value: String) => {
  // Removed in this order, all occurrences, after lowercasing.
  val removedMarkers = Seq(
    "sdrn:schibsted:classified:",
    "sdrn:regresssubito:classified:",
    "sdrn:subito:classified:",
    "sdrn:com.subito.subito:classified:",
    "sdrn:subito:phonecontact:",
    "id%3aad%3a",
    "id:ad:"
  )

  Option(value).flatMap { raw =>
    val stripped = removedMarkers.foldLeft(raw.toLowerCase()) { (text, marker) =>
      text.replace(marker, "")
    }
    val dashed = stripped.replace("%3alist%3a", "-").replace(":list:", "-")
    // lift(1) is None when there is no list separator, where (1) would throw.
    dashed.split("-").lift(1).map(_.trim)
  }.orNull
}

// Expose the helper functions to Spark SQL under snake_case names, in the original order.
Seq(
  "normalize_category" -> normalizeCategory,
  "normalize_macro_category" -> normalizeMacroCategory,
  "normalize_region" -> normalizeRegion,
  "select_ad_id" -> selectAdID,
  "select_list_id" -> selectListID
).foreach { case (udfName, fn) => spark.udf.register(udfName, fn) }

// Flattens the nested event structure into one row per event with short column names,
// caches it, and exposes it to Spark SQL as the `messages` temp view.
//
// `object.messageType` is projected as `messageType` so that queries against the view can
// select it; it is declared in the schema but was previously missing from this projection.
val messages = events.
    //filter(($"@type" === "Launch")).
    select(col("@type").alias("status"),
           col("device.deviceType").alias("device"),
           col("object.@type").alias("type"),
           col("object.attachments").alias("attachments"),
           col("object.body").alias("body"),
           // `inReplyTo` carries the identifier the ad id is extracted from.
           col("object.inReplyTo").alias("adId"),
           col("object.isFirstMessage").alias("isFirstMessage"),
           col("object.origin").alias("origin"),
           col("object.receiverConversationId").alias("receiverConversationId"),
           col("object.senderConversationId").alias("senderConversationId"),
           col("object.subject").alias("subject"),
           col("object.messageType").alias("messageType"),
           col("account.accountId").alias("sender_userId"),
           col("target.accountId").alias("recipient_userId"),
           col("published")).
    cache

messages.createOrReplaceTempView("messages")

In [None]:
// Message type and sender conversation id for each message.
//
// `show` returns Unit, so it is called separately; chaining it into the definition bound
// `data` to Unit rather than to the DataFrame.
// The query needs a `messagetype` column in the `messages` view and fails with an
// AnalysisException if the view does not expose one.
val data = spark.
    sql("SELECT messagetype, senderConversationId from messages ").
    cache

data.show(1000, false)

In [None]:
// Message counts broken down by status, device, type, first-message flag and origin.
//
// The `category` column was removed from the query: the `messages` view is built from
// status, device, type, attachments, body, adId, isFirstMessage, origin, the conversation
// ids, subject, the user ids and published, so `category` cannot be resolved and the query
// failed at analysis time.
// NOTE(review): if a per-category breakdown is required, the category has to be added to
// the view first; confirm where it should come from.
val data = spark.
    sql("SELECT status, device, type, isFirstMessage, origin, count(*) from messages group by status, device, type, isFirstMessage, origin order by device, origin").
    cache


data.show(100, false)

In [None]:
// Number of messages per extracted ad id, most contacted ads first.
//
// Grouping is done on the extracted id rather than on the raw `adId` value: grouping on the
// raw value while displaying the extracted one lets the same ad id appear on several rows
// (one per distinct raw spelling) with its count split between them.
val data = spark.
    sql("SELECT select_ad_id(adId) AS ad_id, count(*) AS message_count from messages group by select_ad_id(adId) order by message_count desc").
    cache


data.show(1000, false)

In [None]:
// Peek at a single fully-populated row of the `messages` view, without truncating values.
val sampleQuery = "SELECT * from  messages limit 1 "
val data = messages.sqlContext.sql(sampleQuery).cache()


data.show(numRows = 1000, truncate = false)