<a href="https://colab.research.google.com/github/Nik8x/Pyspark/blob/master/Hospital_Charges_Data_Analysis/Hospital_Charges_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.3.0/spark-2.3.0-bin-hadoop2.7.tgz
!tar xf spark-2.3.0-bin-hadoop2.7.tgz
!pip install -q findspark
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/f2/64/a1df4440483df47381bbbf6a03119ef66515cf2e1a766d9369811575454b/pyspark-2.4.1.tar.gz (215.7MB)
[K    100% |████████████████████████████████| 215.7MB 123kB/s 
[?25hCollecting py4j==0.10.7 (from pyspark)
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K    100% |████████████████████████████████| 204kB 27.2MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/47/9b/57/7984bf19763749a13eece44c3174adb6ae4bc95b920375ff50
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.1


In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.0-bin-hadoop2.7"

In [0]:
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

sc = spark.sparkContext

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pyspark.sql.functions import col
from pyspark.sql import Row

from pyspark.sql import SQLContext
sql_context = SQLContext(sc)

In [0]:
df = spark.read.format("csv").options(header="true").load("inpatientCharges.csv")

In [7]:
df.printSchema()

root
 |-- DRGDefinition: string (nullable = true)
 |-- ProviderId: string (nullable = true)
 |-- ProviderName: string (nullable = true)
 |-- ProviderStreetAddress: string (nullable = true)
 |-- ProviderCity: string (nullable = true)
 |-- ProviderState: string (nullable = true)
 |-- ProviderZipCode: string (nullable = true)
 |-- HospitalReferralRegionDescription: string (nullable = true)
 |-- TotalDischarges: string (nullable = true)
 |-- AverageCoveredCharges: string (nullable = true)
 |-- AverageTotalPayments: string (nullable = true)
 |-- AverageMedicarePayments: string (nullable = true)



In [24]:
df = df.select(df.DRGDefinition, df.ProviderId.cast("int"), df.ProviderName, df.ProviderStreetAddress, df.ProviderCity, df.ProviderState, 
          df.ProviderZipCode.cast("int"), df.HospitalReferralRegionDescription, df.TotalDischarges.cast("int"),  df.AverageCoveredCharges.cast('float'),
         df.AverageTotalPayments.cast("float"), df.AverageMedicarePayments.cast("float"))
df.printSchema()

root
 |-- DRGDefinition: string (nullable = true)
 |-- ProviderId: integer (nullable = true)
 |-- ProviderName: string (nullable = true)
 |-- ProviderStreetAddress: string (nullable = true)
 |-- ProviderCity: string (nullable = true)
 |-- ProviderState: string (nullable = true)
 |-- ProviderZipCode: integer (nullable = true)
 |-- HospitalReferralRegionDescription: string (nullable = true)
 |-- TotalDischarges: integer (nullable = true)
 |-- AverageCoveredCharges: float (nullable = true)
 |-- AverageTotalPayments: float (nullable = true)
 |-- AverageMedicarePayments: float (nullable = true)



In [25]:
df.show(5)

+--------------------+----------+--------------------+---------------------+------------+-------------+---------------+---------------------------------+---------------+---------------------+--------------------+-----------------------+
|       DRGDefinition|ProviderId|        ProviderName|ProviderStreetAddress|ProviderCity|ProviderState|ProviderZipCode|HospitalReferralRegionDescription|TotalDischarges|AverageCoveredCharges|AverageTotalPayments|AverageMedicarePayments|
+--------------------+----------+--------------------+---------------------+------------+-------------+---------------+---------------------------------+---------------+---------------------+--------------------+-----------------------+
|039 - EXTRACRANIA...|     10001|SOUTHEAST ALABAMA...| 1108 ROSS CLARK C...|      DOTHAN|           AL|          36301|                      AL - Dothan|             91|             32963.07|             5777.24|                4763.73|
|039 - EXTRACRANIA...|     10005|MARSHALL MEDICAL ..

In [26]:
df.count()

163065

In [0]:
df.registerTempTable('hospital_charges') # save the data in a table by registering it in a temp table

In [29]:
df.groupBy("ProviderState").avg("AverageCoveredCharges").show(10) # to find the amount of Average Covered Charges per state.

+-------------+--------------------------+
|ProviderState|avg(AverageCoveredCharges)|
+-------------+--------------------------+
|           AZ|         41200.06305027978|
|           SC|        35862.494556530175|
|           LA|        33085.372762891144|
|           MN|        27894.361835703334|
|           NJ|          66125.6862605791|
|           DC|         40116.66361649418|
|           OR|         27390.11185211581|
|           VA|         29222.00051984144|
|           RI|        29942.701053956738|
|           KY|        24523.807144935305|
+-------------+--------------------------+
only showing top 10 rows



In [30]:
df.groupBy("ProviderState").avg("AverageTotalPayments").show(10) # the amount of Average Medicare Payments charges per state.

+-------------+-------------------------+
|ProviderState|avg(AverageTotalPayments)|
+-------------+-------------------------+
|           AZ|       10154.528211900704|
|           SC|        9132.420751970276|
|           LA|        8638.662569588547|
|           MN|        9948.236957123183|
|           NJ|       10678.988641289126|
|           DC|       12998.029389880952|
|           OR|       10436.192851923355|
|           VA|          8887.7521767515|
|           RI|       10509.566857993197|
|           KY|        8278.588842161007|
+-------------+-------------------------+
only showing top 10 rows



In [31]:
df.groupBy(("ProviderState"),("DRGDefinition")).sum("TotalDischarges").show(10) # the total number of Discharges per state and for each disease.

+-------------+--------------------+--------------------+
|ProviderState|       DRGDefinition|sum(TotalDischarges)|
+-------------+--------------------+--------------------+
|           KY|065 - INTRACRANIA...|                1937|
|           NY|101 - SEIZURES W/...|                4503|
|           IN|149 - DYSEQUILIBRIUM|                 700|
|           IA|178 - RESPIRATORY...|                 540|
|           WI|202 - BRONCHITIS ...|                 338|
|           MO|208 - RESPIRATORY...|                1840|
|           WI|251 - PERC CARDIO...|                 417|
|           AR|281 - ACUTE MYOCA...|                 413|
|           AZ|292 - HEART FAILU...|                2643|
|           NY|292 - HEART FAILU...|               13289|
+-------------+--------------------+--------------------+
only showing top 10 rows



In [0]:
from pyspark.sql.functions import desc
from pyspark.sql.functions import sum as _sum

In [44]:
df.groupBy(("ProviderState"),("DRGDefinition")).sum("TotalDischarges").sort(desc(_sum("TotalDischarges").toString)).show(10)

Py4JError: ignored

In [43]:
df.groupBy(("ProviderState"),("DRGDefinition")).sum("TotalDischarges").orderBy(desc(_sum("TotalDischarges").toString)).show

Py4JError: ignored