In [16]:

import findspark
findspark.init()
import pyspark 
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SparkFunctions").master("local[4]").enableHiveSupport().getOrCreate()

In [17]:

file_location = "C:/Users/Dinesh_2/Desktop/Patient_Dashboard_Active.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

df_vw = spark.read.format(file_type) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .option("encoding", "UTF-8") \
  .load(file_location)


In [18]:
df=df_vw.select('Patient','DOB','SSN','Address','Chart Status')
df.show()

+--------------------+----------+-----------+--------------------+--------------------+
|             Patient|       DOB|        SSN|             Address|        Chart Status|
+--------------------+----------+-----------+--------------------+--------------------+
|     AASGAARD, LINDA|05/14/1943|       null|1400 CIRCLE CITY ...|            Admitted|
|              Shuman|      null|       null|                null|                null|
|       ABARCA, RAMON|08/26/1942|551-66-3306|7120 CORBIN AVE R...|          Discharged|
|      ABBAS, ALIDAEE|04/03/1952|       null|13881 DAWSON ST G...|            Admitted|
|              MARKIE|      null|       null|                null|                null|
|      ABBOTT, CONNIE|09/17/1961|       null|5400 STINE ROAD B...|          Discharged|
|      ABBOTT, CONNIE|09/17/1961|       null|5400 STINE ROAD B...|          Discharged|
|    ABDOLI, MOHAMMAD|08/25/1948|443-74-4056|466 FLAGSHIP RD N...|           Non-admit|
|ABEL MEDRIGAL, DE...|12/13/1971

In [19]:
df.printSchema()

root
 |-- Patient: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- SSN: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Chart Status: string (nullable = true)



In [20]:
df.describe().show()

+-------+--------------------+--------------+--------------------+--------------------+---------------+
|summary|             Patient|           DOB|                 SSN|             Address|   Chart Status|
+-------+--------------------+--------------+--------------------+--------------------+---------------+
|  count|               13657|         11204|                5553|               11188|          11525|
|   mean|                null|          null| 4.905207640543478E8|                null|           null|
| stddev|                null|          null|1.6742654088538024E8|                null|           null|
|    min|"ABRAHAMIAN, KRIK...|    01/01/1916|          36-46-5011|	7324 Canby Ave R...|ARIAS, GABRIELA|
|    max|                x, x|Sanjari, Nadia|         XXX-XX-8917|undecided LOS ANG...|lacanilao, emma|
+-------+--------------------+--------------+--------------------+--------------------+---------------+



In [21]:
df = df.withColumnRenamed('DOB','Date of Birth').withColumnRenamed('Address','ADDR').withColumnRenamed('Chart Status','Chart_Status')
df.show()

+--------------------+-------------+-----------+--------------------+--------------------+
|             Patient|Date of Birth|        SSN|                ADDR|        Chart_Status|
+--------------------+-------------+-----------+--------------------+--------------------+
|     AASGAARD, LINDA|   05/14/1943|       null|1400 CIRCLE CITY ...|            Admitted|
|              Shuman|         null|       null|                null|                null|
|       ABARCA, RAMON|   08/26/1942|551-66-3306|7120 CORBIN AVE R...|          Discharged|
|      ABBAS, ALIDAEE|   04/03/1952|       null|13881 DAWSON ST G...|            Admitted|
|              MARKIE|         null|       null|                null|                null|
|      ABBOTT, CONNIE|   09/17/1961|       null|5400 STINE ROAD B...|          Discharged|
|      ABBOTT, CONNIE|   09/17/1961|       null|5400 STINE ROAD B...|          Discharged|
|    ABDOLI, MOHAMMAD|   08/25/1948|443-74-4056|466 FLAGSHIP RD N...|           Non-admit|

In [22]:
df = df.fillna({'ADDR':'NA', 'SSN' : 'NA'})
df.show()

+--------------------+-------------+-----------+--------------------+--------------------+
|             Patient|Date of Birth|        SSN|                ADDR|        Chart_Status|
+--------------------+-------------+-----------+--------------------+--------------------+
|     AASGAARD, LINDA|   05/14/1943|         NA|1400 CIRCLE CITY ...|            Admitted|
|              Shuman|         null|         NA|                  NA|                null|
|       ABARCA, RAMON|   08/26/1942|551-66-3306|7120 CORBIN AVE R...|          Discharged|
|      ABBAS, ALIDAEE|   04/03/1952|         NA|13881 DAWSON ST G...|            Admitted|
|              MARKIE|         null|         NA|                  NA|                null|
|      ABBOTT, CONNIE|   09/17/1961|         NA|5400 STINE ROAD B...|          Discharged|
|      ABBOTT, CONNIE|   09/17/1961|         NA|5400 STINE ROAD B...|          Discharged|
|    ABDOLI, MOHAMMAD|   08/25/1948|443-74-4056|466 FLAGSHIP RD N...|           Non-admit|

In [23]:
df=df.filter(df.Chart_Status!='Discharged')
df.show()

+--------------------+-------------+-----------+--------------------+--------------------+
|             Patient|Date of Birth|        SSN|                ADDR|        Chart_Status|
+--------------------+-------------+-----------+--------------------+--------------------+
|     AASGAARD, LINDA|   05/14/1943|         NA|1400 CIRCLE CITY ...|            Admitted|
|      ABBAS, ALIDAEE|   04/03/1952|         NA|13881 DAWSON ST G...|            Admitted|
|    ABDOLI, MOHAMMAD|   08/25/1948|443-74-4056|466 FLAGSHIP RD N...|           Non-admit|
|         ABLE, SHARI|   06/06/1939|  566488725|225 North Crescen...|            Admitted|
|Above & Beyond, F...|         null|         NA|                  NA|Transition TO CCTALW|
|   ABRAHAM, FLORENCE|   01/13/1937|554-58-4615|225 NORTH CRESCEN...|            Admitted|
|ABRAMS, JEAN MARI...|   10/31/1954|         NA|9925 LA ALAMEDA A...|            Admitted|
|     ABRAMSON, ALIZA|   06/23/1933|         NA|1250 BOYNTON ST G...|            Admitted|

In [28]:
df1=df.join(df.select('Patient','SSN','ADDR'),(df.Patient==df.Patient) & (df.SSN==df.SSN),how='fullouter')
df1.select('Date of Birth').show()

+-------------+
|Date of Birth|
+-------------+
|   01/11/1973|
|   12/04/1939|
|   10/03/1922|
|   03/06/1988|
|   05/30/1976|
|   10/03/1993|
|   11/11/1984|
|   08/12/1946|
|   03/23/1954|
|   05/31/1981|
|   04/10/1948|
|   05/25/1924|
|   05/25/1924|
|   05/25/1924|
|   05/25/1924|
|   04/18/1946|
|   01/03/1936|
|   05/19/1946|
|   12/29/1950|
|   04/10/1966|
+-------------+
only showing top 20 rows



In [29]:
from pyspark.sql import functions as F
help(F)

Help on module pyspark.sql.functions in pyspark.sql:

NAME
    pyspark.sql.functions - A collections of builtin functions

FUNCTIONS
    abs(col)
        Computes the absolute value.
        
        .. versionadded:: 1.3
    
    acos(col)
        :return: inverse cosine of `col`, as if computed by `java.lang.Math.acos()`
        
        .. versionadded:: 1.4
    
    add_months(start, months)
        Returns the date that is `months` months after `start`
        
        >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
        >>> df.select(add_months(df.dt, 1).alias('next_month')).collect()
        [Row(next_month=datetime.date(2015, 5, 8))]
        
        .. versionadded:: 1.5
    
    approxCountDistinct(col, rsd=None)
        .. note:: Deprecated in 2.1, use :func:`approx_count_distinct` instead.
        
        .. versionadded:: 1.3
    
    approx_count_distinct(col, rsd=None)
        Aggregate function: returns a new :class:`Column` for approximate distinct count 

In [30]:
df.groupBy('Chart_Status').count().show()

+--------------------+-----+
|        Chart_Status|count|
+--------------------+-----+
|  PALACIOS, MICHELLE|   10|
|           Non-admit| 1284|
|       WYNN, THERESA|    1|
|           Wait List|    7|
|          KIM, NAOMI|   20|
|       QUIROZ, DAISY|   33|
|           GO, DIANA|   20|
|Transition TO CCTALW|   90|
|     BERMAN, DEBORAH|   12|
|       Raiken, Susan|   22|
|       SHAHAM, YAFIT|    7|
|            Admitted| 2749|
|      BEMEL, MELANIE|    5|
|      GEWIRTZ, JENNA|    1|
|CCA Preadmit Slot...|    3|
|     ROJAS, JENNIFER|   13|
|    GUEVARA, NATALIE|    4|
|       REYES, MARCUS|    2|
|Transition To CCT...|   22|
|          NAVAL, ANA|    2|
+--------------------+-----+
only showing top 20 rows



In [31]:
df.groupBy('Chart_Status').agg(F.countDistinct('SSN').alias('CNT'),F.count('ADDR').alias('ADDR_CNT')).show()

+--------------------+---+--------+
|        Chart_Status|CNT|ADDR_CNT|
+--------------------+---+--------+
|  PALACIOS, MICHELLE|  1|      10|
|           Non-admit|803|    1284|
|       WYNN, THERESA|  1|       1|
|           Wait List|  4|       7|
|          KIM, NAOMI|  1|      20|
|       QUIROZ, DAISY|  1|      33|
|           GO, DIANA|  1|      20|
|Transition TO CCTALW| 78|      90|
|     BERMAN, DEBORAH|  1|      12|
|       Raiken, Susan|  1|      22|
|       SHAHAM, YAFIT|  1|       7|
|            Admitted|451|    2749|
|      BEMEL, MELANIE|  1|       5|
|      GEWIRTZ, JENNA|  1|       1|
|CCA Preadmit Slot...|  2|       3|
|    GUEVARA, NATALIE|  1|       4|
|     ROJAS, JENNIFER|  1|      13|
|       REYES, MARCUS|  1|       2|
|Transition To CCT...| 20|      22|
|          NAVAL, ANA|  1|       2|
+--------------------+---+--------+
only showing top 20 rows



In [32]:
df.agg(F.count(df.Chart_Status).alias('c')).collect()

[Row(c=4560)]

In [33]:
df.cube('Chart_Status').agg(F.grouping('Chart_Status'),F.count('Chart_Status')).orderBy('Chart_Status').show()

+--------------------+----------------------+-------------------+
|        Chart_Status|grouping(Chart_Status)|count(Chart_Status)|
+--------------------+----------------------+-------------------+
|                null|                     1|               4560|
|     ARIAS, GABRIELA|                     0|                  4|
|            Admitted|                     0|               2749|
|      BEMEL, MELANIE|                     0|                  5|
|     BERMAN, DEBORAH|                     0|                 12|
|        CCA Preadmit|                     0|                  1|
|CCA Preadmit Appl...|                     0|                  9|
|CCA Preadmit Appl...|                     0|                  6|
|CCA Preadmit Slot...|                     0|                  3|
|   CORTEZ, CELESTINA|                     0|                  2|
|      ESCANO, KARINA|                     0|                  2|
|      FIGUEROA, ELIA|                     0|                 24|
|      GEW

In [35]:

df.createOrReplaceTempView('df')

data12=spark.sql('select Chart_Status,count(*) from df group by Chart_Status')
data12.show()

+--------------------+--------+
|        Chart_Status|count(1)|
+--------------------+--------+
|  PALACIOS, MICHELLE|      10|
|           Non-admit|    1284|
|       WYNN, THERESA|       1|
|           Wait List|       7|
|          KIM, NAOMI|      20|
|       QUIROZ, DAISY|      33|
|           GO, DIANA|      20|
|Transition TO CCTALW|      90|
|     BERMAN, DEBORAH|      12|
|       Raiken, Susan|      22|
|       SHAHAM, YAFIT|       7|
|            Admitted|    2749|
|      BEMEL, MELANIE|       5|
|      GEWIRTZ, JENNA|       1|
|CCA Preadmit Slot...|       3|
|     ROJAS, JENNIFER|      13|
|    GUEVARA, NATALIE|       4|
|       REYES, MARCUS|       2|
|Transition To CCT...|      22|
|          NAVAL, ANA|       2|
+--------------------+--------+
only showing top 20 rows



In [36]:
df1= df.withColumn('ID',F.lit(1))
df1.show()

+--------------------+-------------+-----------+--------------------+--------------------+---+
|             Patient|Date of Birth|        SSN|                ADDR|        Chart_Status| ID|
+--------------------+-------------+-----------+--------------------+--------------------+---+
|     AASGAARD, LINDA|   05/14/1943|         NA|1400 CIRCLE CITY ...|            Admitted|  1|
|      ABBAS, ALIDAEE|   04/03/1952|         NA|13881 DAWSON ST G...|            Admitted|  1|
|    ABDOLI, MOHAMMAD|   08/25/1948|443-74-4056|466 FLAGSHIP RD N...|           Non-admit|  1|
|         ABLE, SHARI|   06/06/1939|  566488725|225 North Crescen...|            Admitted|  1|
|Above & Beyond, F...|         null|         NA|                  NA|Transition TO CCTALW|  1|
|   ABRAHAM, FLORENCE|   01/13/1937|554-58-4615|225 NORTH CRESCEN...|            Admitted|  1|
|ABRAMS, JEAN MARI...|   10/31/1954|         NA|9925 LA ALAMEDA A...|            Admitted|  1|
|     ABRAMSON, ALIZA|   06/23/1933|         NA|12

In [37]:
df1.printSchema()

root
 |-- Patient: string (nullable = true)
 |-- Date of Birth: string (nullable = true)
 |-- SSN: string (nullable = false)
 |-- ADDR: string (nullable = false)
 |-- Chart_Status: string (nullable = true)
 |-- ID: integer (nullable = false)



In [39]:
df1=df1.withColumn('ID',F.col('ID').cast('string'))
df1.printSchema()

root
 |-- Patient: string (nullable = true)
 |-- Date of Birth: string (nullable = true)
 |-- SSN: string (nullable = false)
 |-- ADDR: string (nullable = false)
 |-- Chart_Status: string (nullable = true)
 |-- ID: string (nullable = false)

