In [1]:
# NOTE: the wildcard imports stay first so that the explicit names imported
# below always win if pyspark happens to export a clashing name. Be aware that
# `pyspark.sql.functions` shadows Python builtins such as sum/min/max/abs/round.
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

import os
from io import BytesIO

import boto3
import pandas as pd

In [2]:
# Obtain the notebook's SparkSession: reuse the active one if it exists,
# otherwise start a new session under the application name below.
session_builder = SparkSession.builder.appName("heart-disease-risks")
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/11 19:45:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Location of the heart-disease CSV loaded by the following cell. It defaults
# to the original path on the Ubuntu host; set the HEART_DATA_PATH environment
# variable to point the notebook at a copy stored elsewhere without editing
# the code.
filePath = os.environ.get(
    "HEART_DATA_PATH",
    "/home/ubuntu/heart-disease-risks/data/heart_data.csv",
)

In [5]:
# Configure a CSV reader that treats the first row as column names and lets
# Spark infer each column's type, then load the file at `filePath`.
csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
heartDF = csv_reader.load(filePath)

                                                                                

In [6]:
# Display the list of column names of the loaded Spark DataFrame.
heartDF.columns

['index',
 'id',
 'age',
 'gender',
 'height',
 'weight',
 'ap_hi',
 'ap_lo',
 'cholesterol',
 'gluc',
 'smoke',
 'alco',
 'active',
 'cardio']

In [7]:
# Print the first 10 rows as a text table for a quick sanity check.
heartDF.show(10)

+-----+---+-----+------+------+------+-----+-----+-----------+----+-----+----+------+------+
|index| id|  age|gender|height|weight|ap_hi|ap_lo|cholesterol|gluc|smoke|alco|active|cardio|
+-----+---+-----+------+------+------+-----+-----+-----------+----+-----+----+------+------+
|    0|  0|18393|     2|   168|  62.0|  110|   80|          1|   1|    0|   0|     1|     0|
|    1|  1|20228|     1|   156|  85.0|  140|   90|          3|   1|    0|   0|     1|     1|
|    2|  2|18857|     1|   165|  64.0|  130|   70|          3|   1|    0|   0|     0|     1|
|    3|  3|17623|     2|   169|  82.0|  150|  100|          1|   1|    0|   0|     1|     1|
|    4|  4|17474|     1|   156|  56.0|  100|   60|          1|   1|    0|   0|     0|     0|
|    5|  8|21914|     1|   151|  67.0|  120|   80|          2|   2|    0|   0|     0|     0|
|    6|  9|22113|     1|   157|  93.0|  130|   80|          3|   1|    0|   0|     1|     0|
|    7| 12|22584|     2|   178|  95.0|  130|   90|          3|   3|   

In [8]:
# Register heartDF as the temporary view "heart_tbl" so it can be queried
# by name through spark.sql (replaces any existing view with that name).
heartDF.createOrReplaceTempView("heart_tbl")

In [9]:
# Count the rows (by non-null id) for each value of the `cardio` column
# in the registered view.
cardio_counts_query = """SELECT cardio, count(id)
             FROM heart_tbl
             GROUP BY cardio"""
spark.sql(cardio_counts_query).show(10)

[Stage 3:>                                                          (0 + 1) / 1]

+------+---------+
|cardio|count(id)|
+------+---------+
|     1|    34979|
|     0|    35021|
+------+---------+



                                                                                

In [10]:
### Load the same dataset from an S3 bucket instead of the locally hosted CSV

# S3 location of the CSV: bucket name and object key
bucket_name = 'grand-corndog'
key = 'datasets/heart_data.csv'

# Handle to the object, obtained through the boto3 S3 resource API
s3 = boto3.resource('s3')
obj = s3.Object(bucket_name, key)

# Download the object's bytes, then parse them as CSV from an in-memory
# buffer into a pandas DataFrame
csv_bytes = obj.get()['Body'].read()
with BytesIO(csv_bytes) as bio:
    df = pd.read_csv(bio)

# Preview the first rows
df.head()

Unnamed: 0,index,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [14]:
### Enable Arrow-based columnar data transfer so that conversions between
### pandas and Spark DataFrames (e.g. spark.createDataFrame(pandas_df) and
### DataFrame.toPandas()) are faster
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [15]:
# Convert the pandas DataFrame `df` (read from S3) into a Spark DataFrame.
s3_heartDF = spark.createDataFrame(df)

  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]


In [16]:
# Print the first 10 rows of the S3-sourced Spark DataFrame for comparison
# with the locally loaded heartDF.
s3_heartDF.show(10)

23/02/11 19:49:04 WARN TaskSetManager: Stage 8 contains a task of very large size (7661 KiB). The maximum recommended task size is 1000 KiB.
+-----+---+-----+------+------+------+-----+-----+-----------+----+-----+----+------+------+
|index| id|  age|gender|height|weight|ap_hi|ap_lo|cholesterol|gluc|smoke|alco|active|cardio|
+-----+---+-----+------+------+------+-----+-----+-----------+----+-----+----+------+------+
|    0|  0|18393|     2|   168|  62.0|  110|   80|          1|   1|    0|   0|     1|     0|
|    1|  1|20228|     1|   156|  85.0|  140|   90|          3|   1|    0|   0|     1|     1|
|    2|  2|18857|     1|   165|  64.0|  130|   70|          3|   1|    0|   0|     0|     1|
|    3|  3|17623|     2|   169|  82.0|  150|  100|          1|   1|    0|   0|     1|     1|
|    4|  4|17474|     1|   156|  56.0|  100|   60|          1|   1|    0|   0|     0|     0|
|    5|  8|21914|     1|   151|  67.0|  120|   80|          2|   2|    0|   0|     0|     0|
|    6|  9|22113|     

[Stage 8:>                                                          (0 + 1) / 1]                                                                                