In [1]:
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [2]:
import org.apache.spark.sql.types.{DateType, IntegerType, StringType, StructField, StructType}

val dataSchema = new StructType(Array(
    new StructField("HN", StringType, false),
    new StructField("DISEASE", IntegerType, false),
    new StructField("DATEDEFINE", DateType, false),
    new StructField("TYPE", IntegerType, false),
    new StructField("ICD10", StringType, false),
    new StructField("PATIENT_LOCATION_CODE", StringType, false)
  ));

dataSchema = StructType(StructField(HN,StringType,false), StructField(DISEASE,IntegerType,false), StructField(DATEDEFINE,DateType,false), StructField(TYPE,IntegerType,false), StructField(ICD10,StringType,false), StructField(PATIENT_LOCATION_CODE,StringType,false))


StructType(StructField(HN,StringType,false), StructField(DISEASE,IntegerType,false), StructField(DATEDEFINE,DateType,false), StructField(TYPE,IntegerType,false), StructField(ICD10,StringType,false), StructField(PATIENT_LOCATION_CODE,StringType,false))

In [3]:
val source = spark.read.format("csv")
  .option("mode", "FAILFAST")
  .option("inferSchema", "true")
  .option("header", "true")
  .schema(dataSchema)
  .option("path", "./hos_visits.csv")
  .load()

source = [HN: string, DISEASE: int ... 4 more fields]


[HN: string, DISEASE: int ... 4 more fields]

In [4]:
import org.apache.spark.sql.functions._
import scala.collection.mutable


val flu = udf((x: mutable.WrappedArray[String]) => {
    if (x.exists((s) => s.startsWith("J10") || s.startsWith("J11"))) 1 else 0
})

val pneumonia = udf((ary: mutable.WrappedArray[String]) => {
    if (ary.exists(s => s.matches(raw"^J1[2345678]") || s.startsWith("J85"))) 1 else 0
  })

val ili = udf((ary: mutable.WrappedArray[String], patientType: Int) => {
    if (patientType == 1) { // OPD only
      //J00, J02.9, J06.9, J09,J10,J11
      if (ary.exists(s => s.matches(raw"^(J00|J029|J069|J09|J10|J11)"))) 1 else 0
    } else 0
  })

val sari = udf((ary: mutable.WrappedArray[String], patientType: Int) => {
    if (patientType == 2) { // IPD only
      // J00-J22
      if (ary.exists(s => s.matches(raw"^J[01][0-9]") || s.matches(raw"^J2[0-2]"))) 1 else 0
    } else 0
  })

val dfd = source
    .withColumn("ICD10", split(col("ICD10"), "\\|"))
    .withColumn("flu", flu(col("ICD10")))
    .withColumn("pnuemonia", pneumonia(col("ICD10")))
    .withColumn("ili", ili(col("ICD10"), col("TYPE")))
    .withColumn("sari", sari(col("ICD10"), col("TYPE")))

flu = UserDefinedFunction(<function1>,IntegerType,Some(List(ArrayType(StringType,true))))
pneumonia = UserDefinedFunction(<function1>,IntegerType,Some(List(ArrayType(StringType,true))))
ili = UserDefinedFunction(<function2>,IntegerType,Some(List(ArrayType(StringType,true), IntegerType)))
sari = UserDefinedFunction(<function2>,IntegerType,Some(List(ArrayType(StringType,true), IntegerType)))
dfd = [HN: string, DISEASE: int ... 8 more fields]


[HN: string, DISEASE: int ... 8 more fields]

In [5]:
dfd.show(1)

+---------+-------+----------+----+------------------+---------------------+---+---------+---+----+
|       HN|DISEASE|DATEDEFINE|TYPE|             ICD10|PATIENT_LOCATION_CODE|flu|pnuemonia|ili|sari|
+---------+-------+----------+----+------------------+---------------------+---+---------+---+----+
|000019279|     26|2020-01-01|   1|[A919, , , , , , ]|               130608|  0|        0|  0|   0|
+---------+-------+----------+----+------------------+---------------------+---+---------+---+----+
only showing top 1 row



In [6]:
val summary = dfd.groupBy($"DATEDEFINE", $"PATIENT_LOCATION_CODE")
    .agg(sum($"flu") as "flu"
         ,sum($"ili") as "ILI"
         ,sum($"sari") as "SARI"
         ,sum($"pnuemonia") as "pnuemonia")
    

summary = [DATEDEFINE: date, PATIENT_LOCATION_CODE: string ... 4 more fields]


[DATEDEFINE: date, PATIENT_LOCATION_CODE: string ... 4 more fields]

In [7]:
summary.show()

+----------+---------------------+---+---+----+---------+
|DATEDEFINE|PATIENT_LOCATION_CODE|flu|ILI|SARI|pnuemonia|
+----------+---------------------+---+---+----+---------+
|2020-01-02|               104601|  1|  0|   0|        0|
|2020-01-02|               130601|  1|  0|   0|        0|
|2020-01-03|               130605|  0|  0|   0|        0|
|2020-01-03|               130607|  0|  0|   0|        0|
|2020-01-05|               130603|  0|  0|   0|        0|
|2020-01-05|               130607|  0|  0|   0|        0|
|2020-01-06|               130603|  0|  0|   0|        0|
|2020-01-06|               170106|  0|  0|   0|        0|
|2020-01-06|               451602|  0|  0|   0|        0|
|2020-01-07|               130606|  0|  0|   0|        0|
|2020-01-07|               130604|  0|  0|   0|        0|
|2020-01-07|               670603|  0|  0|   0|        0|
|2020-01-09|               130305|  1|  0|   0|        0|
|2020-01-09|               130601|  0|  0|   0|        0|
|2020-01-10|  

In [10]:
summary.repartition(1)
    .write
    .format("csv")
    .mode("overwrite")
    .option("path", "./summary_single.csv")
    .option("header", "TRUE")
    .save()