# # currently this cell only outputs schema result which is last executed, to output both in jupyter cell configuration need to be changed
# %config InteractiveShell.ast_node_interactivity = 'all'


--------------------
when errors are deducted in structured APIs
                SQL       Dataframes      Dataset 
syntax error    Runtime   Compile time    Compile time
analysis error  Runtime   runtime         compile time
--------------------

There are some scenarios where you’ll want to consider using RDDs, such as when
you:
• Are using a third-party package that’s written using RDDs
• Can forgo the code optimization, efficient space utilization, and performance
benefits available with DataFrames and Datasets
• Want to precisely instruct Spark how to do a query
----------------------

In [1]:
# %config InteractiveShell.ast_node_interactivity = 'all'

In [2]:
from pyspark.sql import SparkSession

In [3]:
# creating spark session

spark = SparkSession\
            .builder\
            .appName("IOT-ds-analysis")\
            .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/30 18:59:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
import os

iot_ds_path = os.path.join(os.getcwd(),"datasets/iot_devices.json")
print(f'iot dataset path: {iot_ds_path}')

iot dataset path: /Users/pvasud669@apac.comcast.com/repos/learnings/spark/datasets/iot_devices.json


In [5]:
# read dataset file

iot_df = spark.read.json(iot_ds_path, multiLine=False)


                                                                                

In [6]:
iot_df.columns
iot_df.schema
iot_df.select('battery_level','lcd').show()

+-------------+------+
|battery_level|   lcd|
+-------------+------+
|            8| green|
|            7|   red|
|            2|   red|
|            6|yellow|
|            4| green|
|            3|yellow|
|            3|yellow|
|            0|   red|
|            3| green|
|            7|   red|
|            3|   red|
|            0|yellow|
|            6|yellow|
|            1|yellow|
|            9|yellow|
|            4|   red|
|            0|   red|
|            4|yellow|
|            9|   red|
|            7|yellow|
+-------------+------+
only showing top 20 rows



In [7]:
# Detect failing devices with battery levels below a threshold.

from pyspark.sql.functions import col, min, max, avg

iot_df.select(min('battery_level'), max('battery_level'), avg('battery_level')).show()


iot_df.select('device_name')\
        .where(col('battery_level') < 5)\
        .count()\

iot_df.select('device_name')\
        .where(col('battery_level') < 5)\
        .show()

+------------------+------------------+------------------+
|min(battery_level)|max(battery_level)|avg(battery_level)|
+------------------+------------------+------------------+
|                 0|                 9|4.4997678690377665|
+------------------+------------------+------------------+

+--------------------+
|         device_name|
+--------------------+
| device-mac-36TWSKiT|
|therm-stick-5gimp...|
|sensor-pad-6al7RT...|
|meter-gauge-7GeDoanM|
|sensor-pad-8xUD6p...|
| device-mac-9GcjZ2pw|
|meter-gauge-11dlM...|
|sensor-pad-12Y2kIm0o|
|sensor-pad-14QL93...|
|sensor-pad-16aXmI...|
|meter-gauge-17zb8...|
|sensor-pad-18XULN9Xv|
|therm-stick-25kK6...|
|sensor-pad-28Tsud...|
|device-mac-33B94G...|
|sensor-pad-36VQv8...|
|device-mac-39iklY...|
| sensor-pad-40NjeMqS|
|meter-gauge-43RYo...|
| sensor-pad-448DeWGL|
+--------------------+
only showing top 20 rows



In [18]:
# Identify offending countries with high levels of CO2 emissions.
from pyspark.sql.functions import sum

# iot_df.columns

iot_df.groupby('cn')\
        .sum('c02_level').withColumnRenamed('sum(c02_level)','total_co2')\
        .sort('total_co2', ascending=False)\
        .show()

+-----------------+---------+
|               cn|total_co2|
+-----------------+---------+
|    United States| 82270735|
|            China| 17349538|
|            Japan| 14479050|
|Republic of Korea| 14214130|
|          Germany|  9526623|
|   United Kingdom|  7799008|
|           Canada|  7268528|
|           Russia|  7203677|
|           France|  6369745|
|           Brazil|  3896972|
|        Australia|  3734032|
|            Italy|  3481854|
|           Sweden|  3457874|
|           Poland|  3275637|
|      Netherlands|  2991051|
|            Spain|  2773601|
|           Taiwan|  2553613|
|            India|  2220580|
|                 |  2162108|
|   Czech Republic|  1793205|
+-----------------+---------+
only showing top 20 rows



In [21]:
# Compute the min and max values for temperature, battery level, CO2, and humidity.

# iot_df.columns

iot_df.select(min('temp'),max('temp'),min('battery_level'),max('battery_level'),min('c02_level'),max('c02_level'),min('humidity'),max('humidity')).show()



+---------+---------+------------------+------------------+--------------+--------------+-------------+-------------+
|min(temp)|max(temp)|min(battery_level)|max(battery_level)|min(c02_level)|max(c02_level)|min(humidity)|max(humidity)|
+---------+---------+------------------+------------------+--------------+--------------+-------------+-------------+
|       10|       34|                 0|                 9|           800|          1599|           25|           99|
+---------+---------+------------------+------------------+--------------+--------------+-------------+-------------+



In [25]:
# Sort and group by average temperature, CO2, humidity, and country.

iot_df.groupby('cn')\
        .agg(avg('temp').alias('avg_temp'), avg('battery_level'), avg('c02_level'), avg('humidity'))\
        .sort('cn')\
        .show()

+-------------------+------------------+------------------+------------------+------------------+
|                 cn|          avg_temp|avg(battery_level)|    avg(c02_level)|     avg(humidity)|
+-------------------+------------------+------------------+------------------+------------------+
|                   | 22.17292817679558| 4.502209944751381|1194.5348066298343|62.033149171270715|
|        Afghanistan| 24.05263157894737| 4.105263157894737|1228.4736842105262|  66.6842105263158|
|            Albania|          20.09375|           4.28125|            1161.0|          67.21875|
|            Algeria| 20.91176470588235| 4.088235294117647|1210.9705882352941|63.029411764705884|
|     American Samoa|              20.0| 7.333333333333333|1037.6666666666667|56.666666666666664|
|            Andorra|             20.25|              7.75|            1279.0|              75.0|
|             Angola|24.107142857142858| 5.178571428571429| 1115.142857142857| 66.03571428571429|
|           Anguilla

                                                                                

In [29]:
# total number of countries 

iot_df.select('cn').distinct().show()
iot_df.select('cn').distinct().count()

+-------------------+
|                 cn|
+-------------------+
|             Russia|
|           Paraguay|
|           Anguilla|
|              Macao|
|U.S. Virgin Islands|
|              Yemen|
|            Senegal|
|             Sweden|
|  Republic of Korea|
|        Philippines|
|             Jersey|
|          Singapore|
|           Malaysia|
|             Turkey|
|             Malawi|
|              Åland|
|               Iraq|
|            Germany|
|           Cambodia|
|        Afghanistan|
+-------------------+
only showing top 20 rows



209