In [17]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [18]:
if __name__ == "__main__":
    # Reuse an already-running Spark session if there is one; otherwise start
    # a new session registered under the application name "IoT".
    spark = SparkSession.builder.appName("IoT").getOrCreate()

In [19]:
# Location of the IoT devices JSON dataset. Set the IOT_DEVICES_JSON
# environment variable to point at a local copy of the file; when it is not
# set, the original hard-coded path is used as the default.
iot_file = os.environ.get(
    "IOT_DEVICES_JSON",
    "C:/Users/sean.cornillie/Education/LearningSparkV2/Spark_Dev/datasets/iot_devices.json",
)

In [20]:
# Read the JSON dataset into a DataFrame. No explicit schema is supplied, so
# the reader derives the column types from the data.
iot_df = spark.read.format("json").load(iot_file)

In [21]:
# printSchema() writes the schema tree to stdout itself and returns None, so
# it is called directly; wrapping it in print() emitted a stray "None" line.
iot_df.printSchema()

root
 |-- battery_level: long (nullable = true)
 |-- c02_level: long (nullable = true)
 |-- cca2: string (nullable = true)
 |-- cca3: string (nullable = true)
 |-- cn: string (nullable = true)
 |-- device_id: long (nullable = true)
 |-- device_name: string (nullable = true)
 |-- humidity: long (nullable = true)
 |-- ip: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- lcd: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- scale: string (nullable = true)
 |-- temp: long (nullable = true)
 |-- timestamp: long (nullable = true)

None


In [22]:
# Preview the first 10 rows of the dataset.
iot_df.show(10)

+-------------+---------+----+----+-------------+---------+--------------------+--------+---------------+--------+------+---------+-------+----+-------------+
|battery_level|c02_level|cca2|cca3|           cn|device_id|         device_name|humidity|             ip|latitude|   lcd|longitude|  scale|temp|    timestamp|
+-------------+---------+----+----+-------------+---------+--------------------+--------+---------------+--------+------+---------+-------+----+-------------+
|            8|      868|  US| USA|United States|        1|meter-gauge-1xbYRYcj|      51|   68.161.225.1|    38.0| green|    -97.0|Celsius|  34|1458444054093|
|            7|     1473|  NO| NOR|       Norway|        2|   sensor-pad-2n2Pea|      70|  213.161.254.1|   62.47|   red|     6.15|Celsius|  11|1458444054119|
|            2|     1556|  IT| ITA|        Italy|        3| device-mac-36TWSKiT|      44|      88.36.5.1|   42.83|   red|    12.83|Celsius|  19|1458444054120|
|            6|     1080|  US| USA|United Stat

### 1. Detect failing devices with battery levels below a threshold.

In [23]:
# Baseline for choosing a threshold: average, minimum and maximum battery level.
# avg/min/max here are the Spark column aggregates brought in by the
# pyspark.sql.functions star import, not the Python builtins.
iot_df.select(*[stat("battery_level") for stat in (avg, min, max)]).show()

+------------------+------------------+------------------+
|avg(battery_level)|min(battery_level)|max(battery_level)|
+------------------+------------------+------------------+
|4.4997678690377665|                 0|                 9|
+------------------+------------------+------------------+



In [24]:
# Devices whose battery level is below this value are treated as failing.
# The value 3 is an arbitrary choice; adjust it as needed.
BATTERY_LEVEL_THRESHOLD = 3

(iot_df
    # Filter before projecting so the predicate reads battery_level directly
    # from the source columns rather than from a projection that dropped it.
    .where(col("battery_level") < BATTERY_LEVEL_THRESHOLD)
    .select("device_id", "device_name")
    .orderBy("device_id")
    .show()
)

+---------+--------------------+
|device_id|         device_name|
+---------+--------------------+
|        3| device-mac-36TWSKiT|
|        8|sensor-pad-8xUD6p...|
|       12|sensor-pad-12Y2kIm0o|
|       14|sensor-pad-14QL93...|
|       17|meter-gauge-17zb8...|
|       36|sensor-pad-36VQv8...|
|       39|device-mac-39iklY...|
|       40| sensor-pad-40NjeMqS|
|       43|meter-gauge-43RYo...|
|       44| sensor-pad-448DeWGL|
|       52| sensor-pad-52eFObBC|
|       77|meter-gauge-77IKW...|
|       80|sensor-pad-80TY4d...|
|       84|sensor-pad-84jla9J5O|
|       85| therm-stick-85NcuaO|
|       87|device-mac-87EJxth2l|
|       92| sensor-pad-92vxuq7e|
|       98|sensor-pad-98mJQA...|
|       99|device-mac-99Xh5Y...|
|      102|sensor-pad-102D03...|
+---------+--------------------+
only showing top 20 rows



### 2. Identify offending countries with high levels of CO2 emissions.

In [25]:
# Baseline for choosing a threshold: average, minimum and maximum CO2 level.
# avg/min/max here are the Spark column aggregates brought in by the
# pyspark.sql.functions star import, not the Python builtins.
iot_df.select(*[stat("c02_level") for stat in (avg, min, max)]).show()

+------------------+--------------+--------------+
|    avg(c02_level)|min(c02_level)|max(c02_level)|
+------------------+--------------+--------------+
|1199.7639429967098|           800|          1599|
+------------------+--------------+--------------+



In [26]:
# Countries whose average CO2 reading exceeds this value are flagged as high.
HIGH_CO2_THRESHOLD = 1300

(iot_df
    # Drop rows without a usable country name: nulls and blank strings
    # (the dataset contains records whose cn is blank, which isNotNull()
    # alone does not exclude).
    .where(col("cn").isNotNull() & (trim(col("cn")) != ""))
    .groupBy("cn")
    .agg(avg("c02_level").alias("AvgCO2"))
    # Keep only the countries above the threshold, highest average first.
    .where(col("AvgCO2") > HIGH_CO2_THRESHOLD)
    .sort(desc("AvgCO2"))
    .show()
)

+----------------+------------------+
|              cn|            AvgCO2|
+----------------+------------------+
|           Gabon|            1523.0|
|Falkland Islands|            1424.0|
|          Monaco|            1421.5|
|          Kosovo|            1389.0|
|      San Marino|1379.6666666666667|
|         Liberia|            1374.5|
|           Syria|            1345.8|
|      Mauritania|1344.4285714285713|
|           Congo|          1333.375|
|           Tonga|            1323.0|
|      East Timor|            1310.0|
|          Guinea|            1308.0|
|        Botswana|1302.6666666666667|
+----------------+------------------+



### 3. Compute the min and max values for temperature, battery level, CO2, and humidity.

In [27]:
# Report the minimum and maximum of each sensor reading, one table per column,
# in the order: temperature, battery level, CO2, humidity.
for metric in ("temp", "battery_level", "c02_level", "humidity"):
    iot_df.select(min(metric), max(metric)).show()


+---------+---------+
|min(temp)|max(temp)|
+---------+---------+
|       10|       34|
+---------+---------+

+------------------+------------------+
|min(battery_level)|max(battery_level)|
+------------------+------------------+
|                 0|                 9|
+------------------+------------------+

+--------------+--------------+
|min(c02_level)|max(c02_level)|
+--------------+--------------+
|           800|          1599|
+--------------+--------------+

+-------------+-------------+
|min(humidity)|max(humidity)|
+-------------+-------------+
|           25|           99|
+-------------+-------------+



### 4. Sort and group by average temperature, CO2, humidity, and country.

In [32]:
# Per-country averages of temperature, CO2 and humidity, listed in ascending
# order of country name. Records with a blank country name form their own group.
(iot_df
    .groupBy("cn")
    .agg(*[avg(metric) for metric in ("temp", "c02_level", "humidity")])
    .sort("cn")
    .show()
)

+-------------------+------------------+------------------+------------------+
|                 cn|         avg(temp)|    avg(c02_level)|     avg(humidity)|
+-------------------+------------------+------------------+------------------+
|                   | 22.17292817679558|1194.5348066298343|62.033149171270715|
|        Afghanistan| 24.05263157894737|1228.4736842105262|  66.6842105263158|
|            Albania|          20.09375|            1161.0|          67.21875|
|            Algeria| 20.91176470588235|1210.9705882352941|63.029411764705884|
|     American Samoa|              20.0|1037.6666666666667|56.666666666666664|
|            Andorra|             20.25|            1279.0|              75.0|
|             Angola|24.107142857142858| 1115.142857142857| 66.03571428571429|
|           Anguilla|31.142857142857142| 1165.142857142857|50.714285714285715|
|Antigua and Barbuda|20.933333333333334|1171.4833333333333|              58.6|
|          Argentina|22.060327198364007|1195.1605316

In [13]:
# Stop the Spark session created at the top of the notebook.
spark.stop()