***Advanced use cases of PySpark***

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DataType

In [2]:
# Create the notebook's Spark session (or reuse one that is already running),
# named 'sparkAdvanced' and pointed at the 'local[*]' master.
spark = (
    SparkSession.builder
    .appName('sparkAdvanced')
    .master('local[*]')
    .getOrCreate()
)
# Leave the session as the last expression so the cell displays it.
spark

In [13]:
# Explicit schema for the dataset, expressed as (column name, Spark type)
# pairs in the order the columns are declared.
_SCHEMA_COLUMNS = [
    ('Age', IntegerType),
    ('Gender', StringType),
    ('Occupation', StringType),
    ('SleepHours', FloatType),
    ('PhysicalActivity', FloatType),
    ('CaffeineIntake', IntegerType),
    ('AlcoholConsumption', IntegerType),
    ('Smoking', StringType),
    ('FamilyHistory', StringType),
    ('StressLevel', IntegerType),
    ('HeartRate', IntegerType),
    ('BreathingRate', IntegerType),
    ('SweatingLevel', IntegerType),
    ('Dizziness', StringType),
    ('Medication', StringType),
    ('TherapySessions', IntegerType),
    ('DietQuality', IntegerType),
    ('AnxietyLevel', IntegerType),
]

# Every column is declared nullable; a fresh type instance is created per field.
schema = StructType([
    StructField(column_name, spark_type(), nullable=True)
    for column_name, spark_type in _SCHEMA_COLUMNS
])

In [14]:
# Read the CSV with the explicit schema defined above; header=True tells Spark
# that the first line of the file is a header row.
main_df = spark.read.csv(
    path='socialanxiety/enhanced_anxiety_dataset.csv',
    header=True,
    schema=schema,
)
# Preview only the first two rows.
main_df.show(n=2)

+---+------+----------+----------+----------------+--------------+------------------+-------+-------------+-----------+---------+-------------+-------------+---------+----------+---------------+-----------+------------+
|Age|Gender|Occupation|SleepHours|PhysicalActivity|CaffeineIntake|AlcoholConsumption|Smoking|FamilyHistory|StressLevel|HeartRate|BreathingRate|SweatingLevel|Dizziness|Medication|TherapySessions|DietQuality|AnxietyLevel|
+---+------+----------+----------+----------------+--------------+------------------+-------+-------------+-----------+---------+-------------+-------------+---------+----------+---------------+-----------+------------+
| 29|Female|    Artist|       6.0|             2.7|           181|                10|    Yes|           No|         10|      114|           14|            4|       No|       Yes|              3|          7|           5|
| 46| Other|     Nurse|       6.2|             5.7|           200|                 8|    Yes|          Yes|          1| 

In [15]:
# printSchema() prints the schema tree and returns None, so the tuple displayed
# by this cell starts with None, followed by the list of column names.
(main_df.printSchema(), main_df.columns)

root
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- SleepHours: float (nullable = true)
 |-- PhysicalActivity: float (nullable = true)
 |-- CaffeineIntake: integer (nullable = true)
 |-- AlcoholConsumption: integer (nullable = true)
 |-- Smoking: string (nullable = true)
 |-- FamilyHistory: string (nullable = true)
 |-- StressLevel: integer (nullable = true)
 |-- HeartRate: integer (nullable = true)
 |-- BreathingRate: integer (nullable = true)
 |-- SweatingLevel: integer (nullable = true)
 |-- Dizziness: string (nullable = true)
 |-- Medication: string (nullable = true)
 |-- TherapySessions: integer (nullable = true)
 |-- DietQuality: integer (nullable = true)
 |-- AnxietyLevel: integer (nullable = true)



(None,
 ['Age',
  'Gender',
  'Occupation',
  'SleepHours',
  'PhysicalActivity',
  'CaffeineIntake',
  'AlcoholConsumption',
  'Smoking',
  'FamilyHistory',
  'StressLevel',
  'HeartRate',
  'BreathingRate',
  'SweatingLevel',
  'Dizziness',
  'Medication',
  'TherapySessions',
  'DietQuality',
  'AnxietyLevel'])

In [20]:
# Columns kept for the rest of the notebook (a subset of main_df's columns).
selected_columns = [
    'Age',
    'Gender',
    'SleepHours',
    'PhysicalActivity',
    'AlcoholConsumption',
    'Smoking',
    'FamilyHistory',
    'StressLevel',
    'HeartRate',
    'SweatingLevel',
    'DietQuality',
    'AnxietyLevel',
]

# Project main_df down to the selected columns and preview five rows.
df = main_df.select(selected_columns)
df.show(n=5)

+---+------+----------+----------------+------------------+-------+-------------+-----------+---------+-------------+-----------+------------+
|Age|Gender|SleepHours|PhysicalActivity|AlcoholConsumption|Smoking|FamilyHistory|StressLevel|HeartRate|SweatingLevel|DietQuality|AnxietyLevel|
+---+------+----------+----------------+------------------+-------+-------------+-----------+---------+-------------+-----------+------------+
| 29|Female|       6.0|             2.7|                10|    Yes|           No|         10|      114|            4|          7|           5|
| 46| Other|       6.2|             5.7|                 8|    Yes|          Yes|          1|       62|            2|          8|           3|
| 64|  Male|       5.0|             3.7|                 4|     No|          Yes|          1|       91|            3|          1|           1|
| 20|Female|       5.8|             2.8|                 6|    Yes|           No|          4|       86|            3|          1|           2|

*Note:* We can also create a DataFrame from user-supplied data by using **spark.createDataFrame(input_data, schema)**.

***df.collect()***
- We can retrieve the data of a Spark RDD/DataFrame with collect().
- It returns all the records as a list of Row objects.
- NB: collect() should be used cautiously with large datasets because it can cause OutOfMemoryError on the driver. Alternatives like take() or show() are preferred for sampling or previewing large DataFrames.
- docs: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.collect.html

In [25]:
# collect() returns every record of a DataFrame, which may cause a memory
# error on large data, so cap the DataFrame at five rows with limit() first.
df.limit(num=5).collect()

[Row(Age=29, Gender='Female', SleepHours=6.0, PhysicalActivity=2.700000047683716, AlcoholConsumption=10, Smoking='Yes', FamilyHistory='No', StressLevel=10, HeartRate=114, SweatingLevel=4, DietQuality=7, AnxietyLevel=5),
 Row(Age=46, Gender='Other', SleepHours=6.199999809265137, PhysicalActivity=5.699999809265137, AlcoholConsumption=8, Smoking='Yes', FamilyHistory='Yes', StressLevel=1, HeartRate=62, SweatingLevel=2, DietQuality=8, AnxietyLevel=3),
 Row(Age=64, Gender='Male', SleepHours=5.0, PhysicalActivity=3.700000047683716, AlcoholConsumption=4, Smoking='No', FamilyHistory='Yes', StressLevel=1, HeartRate=91, SweatingLevel=3, DietQuality=1, AnxietyLevel=1),
 Row(Age=20, Gender='Female', SleepHours=5.800000190734863, PhysicalActivity=2.799999952316284, AlcoholConsumption=6, Smoking='Yes', FamilyHistory='No', StressLevel=4, HeartRate=86, SweatingLevel=3, DietQuality=1, AnxietyLevel=2),
 Row(Age=49, Gender='Female', SleepHours=8.199999809265137, PhysicalActivity=2.299999952316284, Alcohol

***RDD (Resilient Distributed Dataset)***
- A Resilient Distributed Dataset (RDD) is the basic abstraction in Spark. It represents an immutable, partitioned collection of elements that can be operated on in parallel.

In [28]:
# Expose the DataFrame's content as an RDD. As the last expression of the
# cell, this displays the RDD object's repr rather than its rows.
df.rdd

MapPartitionsRDD[51] at javaToPython at NativeMethodAccessorImpl.java:0

***df.take(n)***
- Returns the first *n* rows of a DataFrame as a list of Row objects.
- docs: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.take.html
- df.head(n) provides similar functionality.
- df.tail(n) returns the last *n* rows.


In [34]:
# Show the three row-fetching helpers side by side; the cell displays a tuple
# with one list of Row objects per call.
(
    df.take(1),  # first row
    df.head(1),  # first row again, via head()
    df.tail(1),  # last row
)

([Row(Age=29, Gender='Female', SleepHours=6.0, PhysicalActivity=2.700000047683716, AlcoholConsumption=10, Smoking='Yes', FamilyHistory='No', StressLevel=10, HeartRate=114, SweatingLevel=4, DietQuality=7, AnxietyLevel=5)],
 [Row(Age=29, Gender='Female', SleepHours=6.0, PhysicalActivity=2.700000047683716, AlcoholConsumption=10, Smoking='Yes', FamilyHistory='No', StressLevel=10, HeartRate=114, SweatingLevel=4, DietQuality=7, AnxietyLevel=5)],
 [Row(Age=56, Gender='Other', SleepHours=6.099999904632568, PhysicalActivity=1.100000023841858, AlcoholConsumption=11, Smoking='No', FamilyHistory='No', StressLevel=1, HeartRate=66, SweatingLevel=3, DietQuality=8, AnxietyLevel=2)])