In [71]:
# Import Non Spark modules

import numpy as np

In [60]:
# Build Spark App
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('PySpark Hands-On').getOrCreate()

In [61]:
sc = spark.sparkContext

In [62]:
# Print Spark App ID
sc.applicationId

'local-1654084970703'

## Spark Read & Write Operations

#### Using Hadoop Api via Java Gateway

In [23]:
# Using the Java Gateway
URI = sc._gateway.jvm.java.net.URI
Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
fs = FileSystem.get(URI("hdfs://127.0.0.1:9000"), sc._jsc.hadoopConfiguration())

In [25]:
# Make sure to provide correct URI path which can be found in `fs.default.name` property of core-site.xml

In [24]:
fs.listStatus(Path('/user/madara/data'))

JavaObject id=o58

In [27]:
# fs.listFiles(Path('/user/madara/data'))

# Throwing error : py4j.Py4JException: Method listFiles([class org.apache.hadoop.fs.Path]) does not exist

In [None]:
# Get the HDFS 

#### Loading Data from HDFS

In [38]:
# sdf = spark.read.format('csv').load('hdfs://127.0.0.1:9000/user/madara/data/heartstroke/heartstroke.csv')

sdf = spark.read.csv('hdfs://127.0.0.1:9000/user/madara/data/heartstroke/heartstroke.csv', inferSchema=True,header=True)

In [63]:
sdf.printSchema()

root
 |-- id: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- ever_married: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- Residence_type: string (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: string (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: integer (nullable = true)



In [40]:
sdf.describe()

DataFrame[summary: string, id: string, gender: string, age: string, hypertension: string, heart_disease: string, ever_married: string, work_type: string, Residence_type: string, avg_glucose_level: string, bmi: string, smoking_status: string, stroke: string]

#### Data Analysis

In [67]:
# Import modules for Analysis

from pyspark.sql import Row
import pyspark.sql.functions as F
from pyspark.sql.types import *

##### Group By

When we perform groupBy() on Spark Dataframe, it returns RelationalGroupedDataset object which contains below aggregate functions.`

In [64]:
sdf.groupBy('stroke').count().show()

+------+-----+
|stroke|count|
+------+-----+
|     1|  249|
|     0| 4861|
+------+-----+



In [45]:
sdf.groupBy('work_type').count().show()

+-------------+-----+
|    work_type|count|
+-------------+-----+
| Never_worked|   22|
|Self-employed|  819|
|      Private| 2925|
|     children|  687|
|     Govt_job|  657|
+-------------+-----+



In [49]:
sdf.groupBy('work_type').count().withColumnRenamed('count', 'work_type_count').show()

+-------------+---------------+
|    work_type|work_type_count|
+-------------+---------------+
| Never_worked|             22|
|Self-employed|            819|
|      Private|           2925|
|     children|            687|
|     Govt_job|            657|
+-------------+---------------+



In [94]:
gender_cnt_per = sdf.groupBy("gender") \
  .agg(count('gender').alias('gender_count')) \
  .withColumn('perc_of_count_total', F.round((F.col('gender_count') / sdf.count()) * 100, 2))


gender_cnt_per.show()

+------+------------+-------------------+
|gender|gender_count|perc_of_count_total|
+------+------------+-------------------+
|Female|        2994|              58.59|
| Other|           1|               0.02|
|  Male|        2115|              41.39|
+------+------------+-------------------+



In [115]:
male_cnt_total = gender_cnt_per.filter(gender_cnt_per.gender == 'Male').select(F.col('gender_count')).collect()[0]['gender_count']
female_cnt_total = gender_cnt_per.filter(gender_cnt_per.gender == 'Female').select(F.col('gender_count')).collect()[0]['gender_count']

print('Male Count: ', male_cnt_total)
print('Female Count: ', female_cnt_total)

Male Count:  2115
Female Count:  2994


In [112]:
sdf.filter(sdf.gender == "Male").groupBy("gender", "stroke") \
  .agg(count('gender').alias('gender_count')) \
  .withColumn('perc_of_count_total', F.round((F.col('gender_count') / male_cnt_total) * 100, 2)) \
  .show()

+------+------+------------+-------------------+
|gender|stroke|gender_count|perc_of_count_total|
+------+------+------------+-------------------+
|  Male|     1|         108|               5.11|
|  Male|     0|        2007|              94.89|
+------+------+------------+-------------------+



In [113]:
sdf.filter(sdf.gender == "Female").groupBy("gender", "stroke") \
  .agg(count('gender').alias('gender_count')) \
  .withColumn('perc_of_count_total', F.round((F.col('gender_count') / female_cnt_total) * 100, 2)) \
  .show()

+------+------+------------+-------------------+
|gender|stroke|gender_count|perc_of_count_total|
+------+------+------------+-------------------+
|Female|     0|        2853|              95.29|
|Female|     1|         141|               4.71|
+------+------+------------+-------------------+



In [119]:
sdf.filter(sdf.stroke == 1).groupBy("age") \
  .agg(count('age').alias('age_count')) \
  .orderBy(F.col("age").desc()) \
  .show()

+----+---------+
| age|age_count|
+----+---------+
|82.0|        9|
|81.0|       14|
|80.0|       17|
|79.0|       17|
|78.0|       21|
|77.0|        8|
|76.0|       10|
|75.0|        6|
|74.0|        9|
|73.0|        4|
|72.0|        6|
|71.0|        7|
|70.0|        6|
|69.0|        6|
|68.0|        9|
|67.0|        3|
|66.0|        4|
|65.0|        3|
|64.0|        3|
|63.0|        9|
+----+---------+
only showing top 20 rows

