In [8]:
%%init_spark
launcher.master = "yarn"
launcher.conf.spark.app.name = "regex "
launcher.conf.spark.yarn.queue="root.analyst.editor-bu"
launcher.conf.spark.executor.cores=5
launcher.conf.spark.executor.memory="15g"
launcher.conf.spark.driver.memory="10g"
launcher.conf.spark.dynamicAllocation.enabled="true"
launcher.conf.spark.shuffle.service.enabled="true"
launcher.conf.spark.dynamicAllocation.maxExecutors=50
launcher.jars=["/opt/shared/postgresql_1.jar"]
launcher.conf.spark.sql.shuffle.partitions=250
launcher.conf.spark.serializer="org.apache.spark.serializer.KryoSerializer"
launcher.conf.spark.yarn.executor.memoryOverhead="5120m"

In [9]:
import org.joda.time.format.DateTimeFormat
import java.util.Properties
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.joda.time.{DateTime, Days}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.SaveMode._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.json4s._
import org.json4s.jackson.JsonMethods._
import org.joda.time.format.DateTimeFormat
import org.joda.time.{DateTime, Days}
val formatter = DateTimeFormat.forPattern("yyyy-MM-dd")

import scala.util.Try
import java.sql.{Connection, DriverManager, ResultSet}
val sparkSession = SparkSession.builder.master("local").appName("example").getOrCreate()
import sparkSession.implicits._      
import org.apache.spark.SparkContext  


val spark = SparkSession.builder().appName("test").getOrCreate()

val pathFormatter = DateTimeFormat.forPattern("yyyy/MM/dd")
val partitionFormatter = DateTimeFormat.forPattern("yyyy-MM-dd")

val props = new Properties()
val JDBC_URL = "jdbc:postgresql://172.16.33.44:5432/dwh"

props.setProperty("driver", "org.postgresql.Driver")
props.setProperty("max_connections", "10000")
props.setProperty("user", "dwh")
props.setProperty("password", "4F51hnXVMZoDcHrLvf")
props.setProperty("loginTimeout", "30")
props.setProperty("socketTimeout", "1800")


val sqlContext = spark


def publish(df:DataFrame, table:String, append:Boolean): Unit = {
    val conn = DriverManager.getConnection("jdbc:postgresql://172.16.33.44:5432/dwh?user=dwh&password=4F51hnXVMZoDcHrLvf")
    try {
        val mode = if(append) Append else Overwrite
        df.write.mode(mode).jdbc(JDBC_URL, table, props)
    }
    catch {
        case e: Exception => 
        e.printStackTrace()
    }
    finally {
        conn.close
    }
}



def getBigTable(from:DateTime, to:DateTime) : DataFrame = {
  val days = Days.daysBetween(from.withTimeAtStartOfDay(), to.withTimeAtStartOfDay()).getDays
  val files = (0 to days).map { d=>
    val dateStr = from.plusDays(d).toString(partitionFormatter)
    s"/analytics/big-table/partition_date=$dateStr"
  }
  spark.read.parquet(files:_*)
}

def getEvent(app:String, event:String, from:DateTime, to:DateTime, mergeSchemaOption: String = "false") : DataFrame = {
  val days = Days.daysBetween(from.withTimeAtStartOfDay(), to.withTimeAtStartOfDay()).getDays
  val files = (0 to days).map { d=>
    val dateStr = from.plusDays(d).toString(pathFormatter)
    s"/analytics/events/PARQUET/mobile_events/$app/$dateStr/$event"
  }
  spark.read.option("mergeSchema", mergeSchemaOption).parquet(files:_*)
}

def getSocialEvent(from:DateTime, to:DateTime) : DataFrame = {
  val days = Days.daysBetween(from.withTimeAtStartOfDay(), to.withTimeAtStartOfDay()).getDays
  val files = (0 to days).map { d=>
    val dateStr = from.plusDays(d).toString(pathFormatter)
    s"/analytics/events/PARQUET/social_events/$dateStr/"
  }
  spark.read.parquet(files:_*)
}

def getActiveDevices(from:DateTime, to:DateTime): DataFrame = {
  val days = Days.daysBetween(from.withTimeAtStartOfDay(), to.withTimeAtStartOfDay()).getDays
  val files = (0 to days).map { d=>
    val dateStr = from.plusDays(d).toString(pathFormatter)
    s"/analytics/events/PARQUET/mobile_devices/$dateStr/"
  }
  spark.read.parquet(files:_*)
}

def table(app:String, event:String, from:DateTime, to:DateTime) : Unit = {
  val days = Days.daysBetween(from.withTimeAtStartOfDay(), to.withTimeAtStartOfDay()).getDays
  val files = (0 to days).map { d=>
    val dateStr = from.plusDays(d).toString(pathFormatter)
    s"/analytics/events/PARQUET/mobile_events/$app/$dateStr/$event"
  }
  spark.read.parquet(files:_*).registerTempTable(event)
}

def getEntity(entity: String): DataFrame = {
  spark.read.parquet(s"/analytics/entities/$entity")
}

def getUsers() : DataFrame = {
  getEntity("users")
}

def getPhotos() : DataFrame = {
  getEntity("photos")
}

def getContests() : DataFrame = {
  getEntity("contests")
}

def getTags() : DataFrame = {
  getEntity("tags")
}

def getStreams() : DataFrame = {
  getEntity("streams")
}

def getDevices() : DataFrame = {
  getEntity("device_attributes")
}

def getRequests() : DataFrame = {
  getEntity("requests")
}

def getCommon(app:String, from:DateTime, to:DateTime) : DataFrame = {
  getEvent(app, "common", from, to)
}


def today(): DateTime = {
  new DateTime().withTimeAtStartOfDay()
}

def yesterday(): DateTime = {
  today().minusDays(1)
}

def getCommonLastNDays(app:String, days:Int) : org.apache.spark.sql.DataFrame = {
  getEvent(app, "common", today().minusDays(days), today())
}

def getEventLastNDays(app:String, event:String, days:Int) : DataFrame = {
  getEvent(app, event, today().minusDays(days), today())
}

def getStringParam(name:String, default:String): String = {
  Try(System.getenv(name)).getOrElse(default)
}

def getLongParam(name:String, default:Long): Long = {
  Try(System.getenv(name).toLong).getOrElse(default)
}

def getDateParam(name:String, default:DateTime): DateTime = {
  Try(pathFormatter.parseDateTime(System.getenv(name))).getOrElse(default)
}



def getMobileDevices(from:DateTime, to:DateTime): DataFrame = {
  val days = Days.daysBetween(from.withTimeAtStartOfDay(), to.withTimeAtStartOfDay()).getDays
  val files = (0 to days).map { d=>
    val dateStr = from.plusDays(d).toString(pathFormatter)
    s"/analytics/events/PARQUET/mobile_devices/$dateStr/"
  }
  spark.read.parquet(files:_*)
}



def union_events(event:Array[(String,String)], from:DateTime, to:DateTime) : DataFrame = {
  
  var union_base : org.apache.spark.sql.DataFrame = null
  var query:String=""
  
   for(d<-event)
{
    
     if (d._2!="")
     {query="where "+d._2}
    else
    query=""

    var second=getEvent("com.picsart.studio",d._1,from,to).registerTempTable("second")

    var final_second = spark.sql(s""" select * from second  $query """).
    select($"device_id",$"platform",to_date($"timestamp").as("date"),lower($"country_code").as("country_code"))  

if(union_base==null) 
    {union_base=final_second} 
else 
    {union_base=union_base.unionAll(final_second)}

}
return union_base
}
 
 
def erase_table(table_name:String, condition:String ="") {
   var JDBC_DRIVER = "org.postgresql.Driver";  
   var DB_URL = "jdbc:postgresql://172.16.33.44:5432/dwh";
   var USER = "dwh";
   var PASS = "4F51hnXVMZoDcHrLvf";
   var conn:java.sql.Connection = null;
   var stmt:java.sql.Statement = null;
   conn = java.sql.DriverManager.getConnection(DB_URL, USER, PASS);
   stmt = conn.createStatement();
   var sql:String = s"DELETE FROM $table_name" ;
   if (condition != "") { sql = sql+ " where " + condition}
   println(sql)
   stmt.executeUpdate(sql);
   stmt.close()
}

def getActive(from:DateTime, to:DateTime): DataFrame = {

 val days = Days.daysBetween(from.withTimeAtStartOfDay(), to.withTimeAtStartOfDay()).getDays
 var union_base : org.apache.spark.sql.DataFrame = null
(0 to days).map { d=>
val dateStr = from.plusDays(d).toString(pathFormatter) 
var aa=spark.read.parquet(s"/analytics/events/PARQUET/mobile_devices/$dateStr/").
filter($"app"==="com.picsart.studio").
select("device_id").distinct.
withColumn("date",lit(dateStr)).
withColumn("date", regexp_replace(col("date"), "/", "-")).
groupBy(to_date($"date").as("date")).agg(countDistinct("device_id"))

   

if(union_base==null) 
    {union_base=aa} 
else 
    {union_base=union_base.unionAll(aa)}

}
return union_base
}

import org.joda.time.format.DateTimeFormat
import java.util.Properties
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.joda.time.{DateTime, Days}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.SaveMode._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.json4s._
import org.json4s.jackson.JsonMethods._
import org.joda.time.format.DateTimeFormat
import org.joda.time.{DateTime, Days}
formatter: org.joda.time.format.DateTimeFormatter = org.joda.time.format.DateTimeFormatter@66263b58
import scala.util.Try
import java.sql.{Connection, DriverManager, ResultSet}
sparkSession: o...

In [13]:
import java.util.regex.Pattern

// val regex = "[^\\p{L}\\p{N}\\p{P}\\p{Z}\\n\\t\\r]"
// val regex = "(\\Q:)\\E|\\Q:D\\E|\\Q:(\\E|\\Q:wink:\\E)"
// val pattern = Pattern.compile(regex, Pattern.UNICODE_CASE)
// val matcher = pattern.matcher(" ")


val to = DateTime.parse("2019-10-12")
val from = DateTime.parse("2019-09-12")

val text = getEvent("com.picsart.studio", "edit_text_apply", from, to).
filter($"text_content".isin("❤️") && $"platform" === "android").
select($"text_content").distinct().
show(100,false)

+------------+
|text_content|
+------------+
|❤️          |
+------------+



import java.util.regex.Pattern
to: org.joda.time.DateTime = 2019-10-12T00:00:00.000Z
from: org.joda.time.DateTime = 2019-09-12T00:00:00.000Z
text: Unit = ()
