In [1]:
from pyspark.sql import SparkSession

In [2]:
# Creating a Spark Context
spark = SparkSession.builder.getOrCreate()

In [3]:
data_path = './data'

In [4]:
file_path = f'{data_path}/location_temp.csv'

In [5]:
sdf1 = (spark.read
       .format("csv")
       .option("header", "true")
       .load(file_path))

In [6]:
sdf1.head(5)

[Row(event_date='03/04/2019 19:48:06', location_id='loc0', temp_celcius='29'),
 Row(event_date='03/04/2019 19:53:06', location_id='loc0', temp_celcius='27'),
 Row(event_date='03/04/2019 19:58:06', location_id='loc0', temp_celcius='28'),
 Row(event_date='03/04/2019 20:03:06', location_id='loc0', temp_celcius='30'),
 Row(event_date='03/04/2019 20:08:06', location_id='loc0', temp_celcius='27')]

In [7]:
sdf1.show(5)

+-------------------+-----------+------------+
|         event_date|location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|       loc0|          29|
|03/04/2019 19:53:06|       loc0|          27|
|03/04/2019 19:58:06|       loc0|          28|
|03/04/2019 20:03:06|       loc0|          30|
|03/04/2019 20:08:06|       loc0|          27|
+-------------------+-----------+------------+
only showing top 5 rows



In [8]:
sdf1.count()

500000

In [9]:
file_path = f'{data_path}/utilization.csv'

In [10]:
# CSV File does not have headers

sdf2 = (spark.read
        .format("csv")
        .option("header", "false")
        .option("inferSchema","true")
        .load(file_path))

In [11]:
sdf2.show(5)

+-------------------+---+----+----+---+
|                _c0|_c1| _c2| _c3|_c4|
+-------------------+---+----+----+---+
|03/05/2019 08:06:14|100|0.57|0.51| 47|
|03/05/2019 08:11:14|100|0.47|0.62| 43|
|03/05/2019 08:16:14|100|0.56|0.57| 62|
|03/05/2019 08:21:14|100|0.57|0.56| 50|
|03/05/2019 08:26:14|100|0.35|0.46| 43|
+-------------------+---+----+----+---+
only showing top 5 rows



In [12]:
sdf2.count()

500000

In [13]:
sdf2 = (sdf2.withColumnRenamed("_c0", "event_datetime")
            .withColumnRenamed("_c1", "server_id")
            .withColumnRenamed("_c2", "cpu_utilization")
            .withColumnRenamed("_c3", "free_memory")
            .withColumnRenamed("_c4", "session_count"))

In [14]:
sdf2.show(5)

+-------------------+---------+---------------+-----------+-------------+
|     event_datetime|server_id|cpu_utilization|free_memory|session_count|
+-------------------+---------+---------------+-----------+-------------+
|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|
|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|
|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|
|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|
|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|
+-------------------+---------+---------------+-----------+-------------+
only showing top 5 rows



In [15]:
sdf3_json_file_path = f'{data_path}/location_temp.json'
sdf1.write.json(sdf3_json_file_path)

In [16]:
sdf4_json_file_path = f'{data_path}/utilization.json'
sdf2.write.json(sdf4_json_file_path)

In [17]:
!ls './data'

[31mlocation_temp.csv[m[m  [31mserver_name.csv[m[m    [34mutilization.json[m[m
[34mlocation_temp.json[m[m [31mutilization.csv[m[m


In [18]:
sdf3 = (spark.read
            .format("json")
            .load(sdf3_json_file_path))

In [19]:
sdf3.head(5)

[Row(event_date='03/05/2019 03:49:40', location_id='loc423', temp_celcius='25'),
 Row(event_date='03/05/2019 03:54:40', location_id='loc423', temp_celcius='24'),
 Row(event_date='03/05/2019 03:59:40', location_id='loc423', temp_celcius='27'),
 Row(event_date='03/05/2019 04:04:40', location_id='loc423', temp_celcius='22'),
 Row(event_date='03/05/2019 04:09:40', location_id='loc423', temp_celcius='23')]

In [20]:
sdf3.show(5)

+-------------------+-----------+------------+
|         event_date|location_id|temp_celcius|
+-------------------+-----------+------------+
|03/05/2019 03:49:40|     loc423|          25|
|03/05/2019 03:54:40|     loc423|          24|
|03/05/2019 03:59:40|     loc423|          27|
|03/05/2019 04:04:40|     loc423|          22|
|03/05/2019 04:09:40|     loc423|          23|
+-------------------+-----------+------------+
only showing top 5 rows



In [21]:
sdf4 = (spark.read
            .format("json")
            .load(sdf4_json_file_path))

In [22]:
! ls './data/utilization.json'

_SUCCESS
part-00000-f90d4f80-a68b-4267-a22d-c0e04ffdc9ef-c000.json
part-00001-f90d4f80-a68b-4267-a22d-c0e04ffdc9ef-c000.json
part-00002-f90d4f80-a68b-4267-a22d-c0e04ffdc9ef-c000.json
part-00003-f90d4f80-a68b-4267-a22d-c0e04ffdc9ef-c000.json
part-00004-f90d4f80-a68b-4267-a22d-c0e04ffdc9ef-c000.json


In [23]:
sdf4.columns

['cpu_utilization',
 'event_datetime',
 'free_memory',
 'server_id',
 'session_count']

In [24]:
sdf4.describe()

DataFrame[summary: string, cpu_utilization: string, event_datetime: string, free_memory: string, server_id: string, session_count: string]

In [25]:
sdf4.describe().show()

+-------+-------------------+-------------------+-------------------+------------------+------------------+
|summary|    cpu_utilization|     event_datetime|        free_memory|         server_id|     session_count|
+-------+-------------------+-------------------+-------------------+------------------+------------------+
|  count|             500000|             500000|             500000|            500000|            500000|
|   mean| 0.6205177400000055|               null| 0.3791280999999993|             124.5|          69.59616|
| stddev|0.15875173872912818|               null|0.15830931278376192|14.430884120553516|14.850676696352838|
|    min|               0.22|03/05/2019 08:06:14|                0.0|               100|                32|
|    max|                1.0|04/09/2019 01:22:46|               0.78|               149|               105|
+-------+-------------------+-------------------+-------------------+------------------+------------------+



In [26]:
sdf4.printSchema()

root
 |-- cpu_utilization: double (nullable = true)
 |-- event_datetime: string (nullable = true)
 |-- free_memory: double (nullable = true)
 |-- server_id: long (nullable = true)
 |-- session_count: long (nullable = true)



In [27]:
sdf4_sample = sdf4.sample(withReplacement=False, fraction=0.1)

In [28]:
sdf4_sort = sdf4_sample.sort('event_datetime')

In [29]:
sdf3.filter(sdf3["location_id"]=="loc0").show(3)

+-------------------+-----------+------------+
|         event_date|location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|       loc0|          29|
|03/04/2019 19:53:06|       loc0|          27|
|03/04/2019 19:58:06|       loc0|          28|
+-------------------+-----------+------------+
only showing top 3 rows



In [30]:
sdf3.filter(sdf3["location_id"]=="loc0").count()

1000

In [31]:
sdf3.filter("location_id = 'loc1'").show(3)

+-------------------+-----------+------------+
|         event_date|location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|       loc1|          31|
|03/04/2019 19:53:06|       loc1|          26|
|03/04/2019 19:58:06|       loc1|          31|
+-------------------+-----------+------------+
only showing top 3 rows



In [32]:
sdf3.groupBy("location_id").count().show(3)

+-----------+-----+
|location_id|count|
+-----------+-----+
|     loc196| 1000|
|     loc463| 1000|
|     loc226| 1000|
+-----------+-----+
only showing top 3 rows



In [33]:
sdf3.orderBy("location_id").show(3)

+-------------------+-----------+------------+
|         event_date|location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:58:06|       loc0|          28|
|03/04/2019 19:48:06|       loc0|          29|
|03/04/2019 19:53:06|       loc0|          27|
+-------------------+-----------+------------+
only showing top 3 rows



In [34]:
(sdf3.groupby('location_id')
    .agg({'temp_celcius': 'mean'})
    .show(3))

+-----------+-----------------+
|location_id|avg(temp_celcius)|
+-----------+-----------------+
|     loc196|           29.225|
|     loc463|           23.317|
|     loc226|           25.306|
+-----------+-----------------+
only showing top 3 rows



In [35]:
(sdf3.groupby('location_id')
    .agg({'temp_celcius': 'max'})
    .show(3))

+-----------+-----------------+
|location_id|max(temp_celcius)|
+-----------+-----------------+
|     loc196|               36|
|     loc226|               32|
|     loc463|               30|
+-----------+-----------------+
only showing top 3 rows



In [36]:
(sdf3.groupBy("location_id")
     .agg({'temp_celcius': 'mean'})
     .orderBy("location_id")
     .show(3))

+-----------+-----------------+
|location_id|avg(temp_celcius)|
+-----------+-----------------+
|       loc0|           29.176|
|       loc1|           28.246|
|      loc10|           25.337|
+-----------+-----------------+
only showing top 3 rows



In [37]:
sdf3.write.csv('./data/sdf3.csv')

In [38]:
! ls './data'

[31mlocation_temp.csv[m[m  [34msdf3.csv[m[m           [31mutilization.csv[m[m
[34mlocation_temp.json[m[m [31mserver_name.csv[m[m    [34mutilization.json[m[m


In [39]:
!ls './data/sdf3.csv'

_SUCCESS
part-00000-f5051421-4bc9-4dc7-8e87-612f86d01554-c000.csv
part-00001-f5051421-4bc9-4dc7-8e87-612f86d01554-c000.csv
part-00002-f5051421-4bc9-4dc7-8e87-612f86d01554-c000.csv
part-00003-f5051421-4bc9-4dc7-8e87-612f86d01554-c000.csv
part-00004-f5051421-4bc9-4dc7-8e87-612f86d01554-c000.csv
part-00005-f5051421-4bc9-4dc7-8e87-612f86d01554-c000.csv
part-00006-f5051421-4bc9-4dc7-8e87-612f86d01554-c000.csv
part-00007-f5051421-4bc9-4dc7-8e87-612f86d01554-c000.csv
part-00008-f5051421-4bc9-4dc7-8e87-612f86d01554-c000.csv
part-00009-f5051421-4bc9-4dc7-8e87-612f86d01554-c000.csv
part-00010-f5051421-4bc9-4dc7-8e87-612f86d01554-c000.csv


In [41]:
! head './data/sdf3.csv/part-00000-f5051421-4bc9-4dc7-8e87-612f86d01554-c000.csv'

03/05/2019 03:49:40,loc423,25
03/05/2019 03:54:40,loc423,24
03/05/2019 03:59:40,loc423,27
03/05/2019 04:04:40,loc423,22
03/05/2019 04:09:40,loc423,23
03/05/2019 04:14:40,loc423,26
03/05/2019 04:19:40,loc423,28
03/05/2019 04:24:40,loc423,22
03/05/2019 04:29:40,loc423,22
03/05/2019 04:34:40,loc423,24


In [42]:
sdf3.write.json('./data/sdf3.json')

In [43]:
! ls './data'

[31mlocation_temp.csv[m[m  [34msdf3.csv[m[m           [31mserver_name.csv[m[m    [34mutilization.json[m[m
[34mlocation_temp.json[m[m [34msdf3.json[m[m          [31mutilization.csv[m[m


In [44]:
!ls './data/sdf3.json'

_SUCCESS
part-00000-be0377a9-4877-4524-92ce-8e611155b4c5-c000.json
part-00001-be0377a9-4877-4524-92ce-8e611155b4c5-c000.json
part-00002-be0377a9-4877-4524-92ce-8e611155b4c5-c000.json
part-00003-be0377a9-4877-4524-92ce-8e611155b4c5-c000.json
part-00004-be0377a9-4877-4524-92ce-8e611155b4c5-c000.json
part-00005-be0377a9-4877-4524-92ce-8e611155b4c5-c000.json
part-00006-be0377a9-4877-4524-92ce-8e611155b4c5-c000.json
part-00007-be0377a9-4877-4524-92ce-8e611155b4c5-c000.json
part-00008-be0377a9-4877-4524-92ce-8e611155b4c5-c000.json
part-00009-be0377a9-4877-4524-92ce-8e611155b4c5-c000.json
part-00010-be0377a9-4877-4524-92ce-8e611155b4c5-c000.json


In [45]:
! head './data/sdf3.json/part-00000-be0377a9-4877-4524-92ce-8e611155b4c5-c000.json'

{"event_date":"03/05/2019 03:49:40","location_id":"loc423","temp_celcius":"25"}
{"event_date":"03/05/2019 03:54:40","location_id":"loc423","temp_celcius":"24"}
{"event_date":"03/05/2019 03:59:40","location_id":"loc423","temp_celcius":"27"}
{"event_date":"03/05/2019 04:04:40","location_id":"loc423","temp_celcius":"22"}
{"event_date":"03/05/2019 04:09:40","location_id":"loc423","temp_celcius":"23"}
{"event_date":"03/05/2019 04:14:40","location_id":"loc423","temp_celcius":"26"}
{"event_date":"03/05/2019 04:19:40","location_id":"loc423","temp_celcius":"28"}
{"event_date":"03/05/2019 04:24:40","location_id":"loc423","temp_celcius":"22"}
{"event_date":"03/05/2019 04:29:40","location_id":"loc423","temp_celcius":"22"}
{"event_date":"03/05/2019 04:34:40","location_id":"loc423","temp_celcius":"24"}
