In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [3]:
# Obtain the SparkSession used by every later cell (an existing session is
# reused if there is one, otherwise a new one is created).
spark = SparkSession.builder.getOrCreate()

In [6]:
# Load the visit records from 'visit.json' (path relative to the working
# directory) into a Spark DataFrame.
visits = spark.read.json('visit.json')

In [7]:
# Print a preview of the loaded rows as a text table.
visits.show()

+-------------------+------+-----+------------+-----------+
|               date|gender|icd10|patient_type|subdistrict|
+-------------------+------+-----+------------+-----------+
|2020-03-12T00:00:00|     F|C01.2|         OPD|      92001|
|2020-03-11T00:00:00|     M|C01.2|         IPD|      92003|
|2020-03-03T00:00:00|     M|C01.2|         IPD|      92002|
|2020-03-14T00:00:00|     F|A01.1|         IPD|      92001|
|2020-03-01T00:00:00|     M|B01.3|         OPD|      92001|
|2020-03-02T00:00:00|     M|C01.2|         OPD|      92001|
|2020-03-14T00:00:00|     M|C01.2|         IPD|      92003|
|2020-03-16T00:00:00|     F|B01.3|         IPD|      92001|
|2020-03-11T00:00:00|     F|C01.2|         OPD|      92002|
|2020-03-12T00:00:00|     F|B01.3|         IPD|      92001|
|2020-03-04T00:00:00|     M|A01.1|         OPD|      92003|
|2020-03-14T00:00:00|     F|C01.2|         OPD|      92002|
|2020-03-02T00:00:00|     F|C01.2|         IPD|      92003|
|2020-03-03T00:00:00|     F|C01.2|      

In [9]:
# Check which class spark.read.json returned.
type(visits)

pyspark.sql.dataframe.DataFrame

In [16]:
# transform a column with a Spark UDF
# using .withColumn
# read more from https://towardsdatascience.com/5-ways-to-add-a-new-column-in-a-pyspark-dataframe-4e75c2fd8c08

import pyspark.sql.functions as F
from pyspark.sql.types import *

def flu(icd10):
    """Return 1 when ``icd10`` equals 'B01.3', otherwise 0.

    Any other value, including None, yields 0.
    """
    return 1 if icd10 == 'B01.3' else 0

# Wrap the plain Python function as a Spark UDF with an integer return type.
fluFn = F.udf(flu, returnType=IntegerType())

# Add the UDF result as a new "flu" column and print a preview of the rows.
(
    visits
    .withColumn("flu", fluFn(F.col("icd10")))
    .show()
)

+-------------------+------+-----+------------+-----------+---+
|               date|gender|icd10|patient_type|subdistrict|flu|
+-------------------+------+-----+------------+-----------+---+
|2020-03-12T00:00:00|     F|C01.2|         OPD|      92001|  0|
|2020-03-11T00:00:00|     M|C01.2|         IPD|      92003|  0|
|2020-03-03T00:00:00|     M|C01.2|         IPD|      92002|  0|
|2020-03-14T00:00:00|     F|A01.1|         IPD|      92001|  0|
|2020-03-01T00:00:00|     M|B01.3|         OPD|      92001|  1|
|2020-03-02T00:00:00|     M|C01.2|         OPD|      92001|  0|
|2020-03-14T00:00:00|     M|C01.2|         IPD|      92003|  0|
|2020-03-16T00:00:00|     F|B01.3|         IPD|      92001|  1|
|2020-03-11T00:00:00|     F|C01.2|         OPD|      92002|  0|
|2020-03-12T00:00:00|     F|B01.3|         IPD|      92001|  1|
|2020-03-04T00:00:00|     M|A01.1|         OPD|      92003|  0|
|2020-03-14T00:00:00|     F|C01.2|         OPD|      92002|  0|
|2020-03-02T00:00:00|     F|C01.2|      

In [22]:
# transform the date column from string to date (DateType)
# read more on https://stackoverflow.com/questions/38080748/convert-pyspark-string-to-date-format

import dateutil.parser

def dt(d):
    """Parse a date/time string with dateutil, passing nulls through.

    Args:
        d: Text such as '2020-03-12T00:00:00', or None.

    Returns:
        The value produced by ``dateutil.parser.parse(d)``, or None when
        ``d`` is None.

    Errors raised by dateutil for text it cannot parse propagate unchanged.
    """
    # When used as a Spark UDF, SQL NULLs arrive as None. dateutil rejects
    # None with a TypeError, which would fail the whole job, so keep nulls
    # as nulls instead.
    if d is None:
        return None
    return dateutil.parser.parse(d)

# Wrap the parser as a Spark UDF whose declared return type is DateType.
dtFn = F.udf(dt, returnType=DateType())
# Overwrite the string "date" column with the parsed value and print a preview.
(
    visits
    .withColumn("date", dtFn(F.col("date")))
    .show()
)

+----------+------+-----+------------+-----------+
|      date|gender|icd10|patient_type|subdistrict|
+----------+------+-----+------------+-----------+
|2020-03-12|     F|C01.2|         OPD|      92001|
|2020-03-11|     M|C01.2|         IPD|      92003|
|2020-03-03|     M|C01.2|         IPD|      92002|
|2020-03-14|     F|A01.1|         IPD|      92001|
|2020-03-01|     M|B01.3|         OPD|      92001|
|2020-03-02|     M|C01.2|         OPD|      92001|
|2020-03-14|     M|C01.2|         IPD|      92003|
|2020-03-16|     F|B01.3|         IPD|      92001|
|2020-03-11|     F|C01.2|         OPD|      92002|
|2020-03-12|     F|B01.3|         IPD|      92001|
|2020-03-04|     M|A01.1|         OPD|      92003|
|2020-03-14|     F|C01.2|         OPD|      92002|
|2020-03-02|     F|C01.2|         IPD|      92003|
|2020-03-03|     F|C01.2|         OPD|      92003|
|2020-03-16|     F|C01.2|         IPD|      92001|
|2020-03-15|     F|A01.1|         IPD|      92001|
|2020-03-16|     F|B01.3|      