In [1]:
# ! pip install findspark

In [2]:
# initiate PySpark
import findspark
findspark.init()

Dibandingkan dengan menggunakan pengaturan variabel seperti PYSPARK_DRIVER_PYTHON="jupyter" dan PYSPARK_DRIVER_PYTHON_OPTS="notebook" agar dapat menggunakan jupyter notebook dengan Spark, saya menggunakan library findspark.

---

Instead of using variable settings like PYSPARK_DRIVER_PYTHON = "jupyter" and PYSPARK_DRIVER_PYTHON_OPTS = "notebook" in order to use jupyter notebooks with Spark, I used the findspark library.

In [3]:
from pyspark.sql import SparkSession

In [4]:
# initiate spark session
spark = SparkSession.builder.getOrCreate()

In [5]:
spark

#### Load data into DataFrames: CSV

In [6]:
DATA_PATH = "Ex_Files_Spark_SQL_DataFrames/Exercise Files/Data"
FILE_PATH_1 = DATA_PATH + "/location_temp.csv"

In [7]:
# load csv file
df1 = spark.read.format("csv").option("header", "true").load(FILE_PATH_1)

# another way to get the same result
# df1 = spark.read.csv(FILE_PATH_1, header=True)

In [8]:
# shows the first ten rows
df1.head(10)

[Row(event_date='03/04/2019 19:48:06', location_id='loc0', temp_celcius='29'),
 Row(event_date='03/04/2019 19:53:06', location_id='loc0', temp_celcius='27'),
 Row(event_date='03/04/2019 19:58:06', location_id='loc0', temp_celcius='28'),
 Row(event_date='03/04/2019 20:03:06', location_id='loc0', temp_celcius='30'),
 Row(event_date='03/04/2019 20:08:06', location_id='loc0', temp_celcius='27'),
 Row(event_date='03/04/2019 20:13:06', location_id='loc0', temp_celcius='27'),
 Row(event_date='03/04/2019 20:18:06', location_id='loc0', temp_celcius='27'),
 Row(event_date='03/04/2019 20:23:06', location_id='loc0', temp_celcius='29'),
 Row(event_date='03/04/2019 20:28:06', location_id='loc0', temp_celcius='32'),
 Row(event_date='03/04/2019 20:33:06', location_id='loc0', temp_celcius='35')]

In [9]:
# shows in tabular format
df1.show()

+-------------------+-----------+------------+
|         event_date|location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|       loc0|          29|
|03/04/2019 19:53:06|       loc0|          27|
|03/04/2019 19:58:06|       loc0|          28|
|03/04/2019 20:03:06|       loc0|          30|
|03/04/2019 20:08:06|       loc0|          27|
|03/04/2019 20:13:06|       loc0|          27|
|03/04/2019 20:18:06|       loc0|          27|
|03/04/2019 20:23:06|       loc0|          29|
|03/04/2019 20:28:06|       loc0|          32|
|03/04/2019 20:33:06|       loc0|          35|
|03/04/2019 20:38:06|       loc0|          32|
|03/04/2019 20:43:06|       loc0|          28|
|03/04/2019 20:48:06|       loc0|          28|
|03/04/2019 20:53:06|       loc0|          32|
|03/04/2019 20:58:06|       loc0|          34|
|03/04/2019 21:03:06|       loc0|          33|
|03/04/2019 21:08:06|       loc0|          27|
|03/04/2019 21:13:06|       loc0|          28|
|03/04/2019 2

In [10]:
# count the number of rows
df1.count()

500000

In [11]:
# load second file, no header csv file
FILE_PATH_2 = DATA_PATH + "/utilization.csv"
df2 = spark.read.format("csv").option("header", "false").option("inferSchema", "true").load(FILE_PATH_2)

In [12]:
df2.show()

+-------------------+---+----+----+---+
|                _c0|_c1| _c2| _c3|_c4|
+-------------------+---+----+----+---+
|03/05/2019 08:06:14|100|0.57|0.51| 47|
|03/05/2019 08:11:14|100|0.47|0.62| 43|
|03/05/2019 08:16:14|100|0.56|0.57| 62|
|03/05/2019 08:21:14|100|0.57|0.56| 50|
|03/05/2019 08:26:14|100|0.35|0.46| 43|
|03/05/2019 08:31:14|100|0.41|0.58| 48|
|03/05/2019 08:36:14|100|0.57|0.35| 58|
|03/05/2019 08:41:14|100|0.41| 0.4| 58|
|03/05/2019 08:46:14|100|0.53|0.35| 62|
|03/05/2019 08:51:14|100|0.51| 0.6| 45|
|03/05/2019 08:56:14|100|0.32|0.37| 47|
|03/05/2019 09:01:14|100|0.62|0.59| 60|
|03/05/2019 09:06:14|100|0.66|0.72| 57|
|03/05/2019 09:11:14|100|0.54|0.54| 44|
|03/05/2019 09:16:14|100|0.29| 0.4| 47|
|03/05/2019 09:21:14|100|0.43|0.68| 66|
|03/05/2019 09:26:14|100|0.49|0.66| 65|
|03/05/2019 09:31:14|100|0.64|0.55| 66|
|03/05/2019 09:36:14|100|0.42| 0.6| 42|
|03/05/2019 09:41:14|100|0.55|0.59| 63|
+-------------------+---+----+----+---+
only showing top 20 rows



In [13]:
# show schema
df2.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: integer (nullable = true)



In [14]:
# rename columns
df2 = df2.withColumnRenamed("_c0", "event_datetime") \
         .withColumnRenamed("_c1", "server_id") \
         .withColumnRenamed("_c2", "cpu_utilization") \
         .withColumnRenamed("_c3", "free_memory") \
         .withColumnRenamed("_c4", "session_count")

In [15]:
df2.show()

+-------------------+---------+---------------+-----------+-------------+
|     event_datetime|server_id|cpu_utilization|free_memory|session_count|
+-------------------+---------+---------------+-----------+-------------+
|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|
|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|
|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|
|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|
|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|
|03/05/2019 08:31:14|      100|           0.41|       0.58|           48|
|03/05/2019 08:36:14|      100|           0.57|       0.35|           58|
|03/05/2019 08:41:14|      100|           0.41|        0.4|           58|
|03/05/2019 08:46:14|      100|           0.53|       0.35|           62|
|03/05/2019 08:51:14|      100|           0.51|        0.6|           45|
|03/05/2019 08:56:14|      100|       

In [16]:
# df2.write.csv("df2.csv")

#### Load data into DataFrames: JSON

---

Karena tidak ada file json yang disediakan maka saya mencoba dengan menyimpan data dari DataFrames sebelumnya ke format json.

Since no json file is provided, I tried saving the data from the previous DataFrames to json format.

In [17]:
# menyimpan DataFrame ke format json
df2.coalesce(1).write.mode("overwrite").format("json").save("df2.json")

In [18]:
! dir df2.json

 Volume in drive D is DATA
 Volume Serial Number is 5291-7DC1

 Directory of D:\LinkedIn_Learning\Intro_to_SparkSQL_and_Dataframe\df2.json

11/01/2021  18.33    <DIR>          .
11/01/2021  18.33    <DIR>          ..
11/01/2021  18.33           460.208 .part-00000-cff3aebd-af7d-4309-9f24-ba399d67eec5-c000.json.crc
11/01/2021  18.33                 8 ._SUCCESS.crc
11/01/2021  18.33        58.905.124 part-00000-cff3aebd-af7d-4309-9f24-ba399d67eec5-c000.json
11/01/2021  18.33                 0 _SUCCESS
               4 File(s)     59.365.340 bytes
               2 Dir(s)  107.361.665.024 bytes free


Sebelumnya saya menggunakan Spark dengan versi spark-3.0.1-bin-hadoop3.2 dan Python 3.8, namun saya mendapatkan eror ketika ingin menyimpan (DataFrame.write) (belum tahu apa penyebabnya). Kemudian **secara random** mengganti dengan spark-2.4.7-bin-hadoop2.7 dan Python 3.7 (karena ketika digunakan dengan Python 3.8 akan menimbulkan eror saat membuat Spark Session).

---

Previously I used Spark with versions spark-3.0.1-bin-hadoop3.2 and Python 3.8, but I got an error when I wanted to save (DataFrame.write) (don't know what the cause was). Then **randomly** replace with spark-2.4.7-bin-hadoop2.7 and Python 3.7 (because when used with Python 3.8 it will cause an error when creating a Spark Session).

In [19]:
# read file json
df3 = spark.read.format("json").load("df2.json")

In [20]:
df3.head(10)

[Row(cpu_utilization=0.57, event_datetime='03/05/2019 08:06:14', free_memory=0.51, server_id=100, session_count=47),
 Row(cpu_utilization=0.47, event_datetime='03/05/2019 08:11:14', free_memory=0.62, server_id=100, session_count=43),
 Row(cpu_utilization=0.56, event_datetime='03/05/2019 08:16:14', free_memory=0.57, server_id=100, session_count=62),
 Row(cpu_utilization=0.57, event_datetime='03/05/2019 08:21:14', free_memory=0.56, server_id=100, session_count=50),
 Row(cpu_utilization=0.35, event_datetime='03/05/2019 08:26:14', free_memory=0.46, server_id=100, session_count=43),
 Row(cpu_utilization=0.41, event_datetime='03/05/2019 08:31:14', free_memory=0.58, server_id=100, session_count=48),
 Row(cpu_utilization=0.57, event_datetime='03/05/2019 08:36:14', free_memory=0.35, server_id=100, session_count=58),
 Row(cpu_utilization=0.41, event_datetime='03/05/2019 08:41:14', free_memory=0.4, server_id=100, session_count=58),
 Row(cpu_utilization=0.53, event_datetime='03/05/2019 08:46:14', 

In [21]:
df3.show()

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58|
|           0.53|03/05/2019 08:46:14|       0.35|      100|           62|
|           0.51|03/05/2019 08:51:14|        0.6|      100|           45|
|           0.32|03/05/2019 08:56:14| 

In [22]:
# list the columns
df3.columns

['cpu_utilization',
 'event_datetime',
 'free_memory',
 'server_id',
 'session_count']

In [23]:
# get sample
df3_sample = df3.sample(False, fraction=0.1) # False --> withReplacement=False

In [24]:
df3_sample.count()

49564

In [25]:
df3_sample.show(10)

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.49|03/05/2019 09:26:14|       0.66|      100|           65|
|           0.55|03/05/2019 09:41:14|       0.59|      100|           63|
|           0.63|03/05/2019 10:11:14|       0.63|      100|           59|
|           0.56|03/05/2019 10:26:14|       0.69|      100|           44|
|           0.45|03/05/2019 11:11:14|       0.73|      100|           45|
|           0.66|03/05/2019 11:31:14|        0.6|      100|           56|
|           0.39|03/05/2019 11:36:14|       0.54|      100|           70|
|           0.52|03/05/2019 11:51:14|       0.68|      100|           41|
|           0.64|03/05/2019 11:56:14|       0.62|      100|           68|
|            0.4|03/05/2019 12:06:14|       0.36|      100|           53|
+---------------+-------------------+-

In [26]:
# sorting
df3_sample.sort("event_datetime").show(10)

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.51|03/05/2019 08:07:17|        0.4|      136|           76|
|           0.44|03/05/2019 08:11:41|       0.38|      116|           49|
|           0.64|03/05/2019 08:12:23|        0.3|      139|           96|
|           0.94|03/05/2019 08:12:41|       0.27|      148|           68|
|           0.59|03/05/2019 08:16:36|       0.05|      113|           74|
|           0.55|03/05/2019 08:16:40|        0.4|      115|           64|
|           0.53|03/05/2019 08:16:46|       0.52|      119|           57|
|           0.55|03/05/2019 08:16:48|       0.61|      120|           54|
|           0.89|03/05/2019 08:16:50|       0.49|      121|           90|
|           0.54|03/05/2019 08:17:06|       0.28|      130|           55|
+---------------+-------------------+-

#### Filter data with DataFrame API

In [27]:
# we use data from location_temp.csv
df1.show()

+-------------------+-----------+------------+
|         event_date|location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|       loc0|          29|
|03/04/2019 19:53:06|       loc0|          27|
|03/04/2019 19:58:06|       loc0|          28|
|03/04/2019 20:03:06|       loc0|          30|
|03/04/2019 20:08:06|       loc0|          27|
|03/04/2019 20:13:06|       loc0|          27|
|03/04/2019 20:18:06|       loc0|          27|
|03/04/2019 20:23:06|       loc0|          29|
|03/04/2019 20:28:06|       loc0|          32|
|03/04/2019 20:33:06|       loc0|          35|
|03/04/2019 20:38:06|       loc0|          32|
|03/04/2019 20:43:06|       loc0|          28|
|03/04/2019 20:48:06|       loc0|          28|
|03/04/2019 20:53:06|       loc0|          32|
|03/04/2019 20:58:06|       loc0|          34|
|03/04/2019 21:03:06|       loc0|          33|
|03/04/2019 21:08:06|       loc0|          27|
|03/04/2019 21:13:06|       loc0|          28|
|03/04/2019 2

In [28]:
df1.filter(df1["location_id"]=="loc0").count()

1000

In [29]:
df1_loc1 = df1.filter(df1["location_id"]=="loc1")

In [30]:
df1_loc1.show()

+-------------------+-----------+------------+
|         event_date|location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|       loc1|          31|
|03/04/2019 19:53:06|       loc1|          26|
|03/04/2019 19:58:06|       loc1|          31|
|03/04/2019 20:03:06|       loc1|          26|
|03/04/2019 20:08:06|       loc1|          28|
|03/04/2019 20:13:06|       loc1|          27|
|03/04/2019 20:18:06|       loc1|          30|
|03/04/2019 20:23:06|       loc1|          28|
|03/04/2019 20:28:06|       loc1|          28|
|03/04/2019 20:33:06|       loc1|          27|
|03/04/2019 20:38:06|       loc1|          30|
|03/04/2019 20:43:06|       loc1|          32|
|03/04/2019 20:48:06|       loc1|          26|
|03/04/2019 20:53:06|       loc1|          30|
|03/04/2019 20:58:06|       loc1|          26|
|03/04/2019 21:03:06|       loc1|          28|
|03/04/2019 21:08:06|       loc1|          27|
|03/04/2019 21:13:06|       loc1|          28|
|03/04/2019 2

In [31]:
df1_loc1.count()

1000

#### Aggregate data with DataFrame API

In [32]:
df1.show()

+-------------------+-----------+------------+
|         event_date|location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|       loc0|          29|
|03/04/2019 19:53:06|       loc0|          27|
|03/04/2019 19:58:06|       loc0|          28|
|03/04/2019 20:03:06|       loc0|          30|
|03/04/2019 20:08:06|       loc0|          27|
|03/04/2019 20:13:06|       loc0|          27|
|03/04/2019 20:18:06|       loc0|          27|
|03/04/2019 20:23:06|       loc0|          29|
|03/04/2019 20:28:06|       loc0|          32|
|03/04/2019 20:33:06|       loc0|          35|
|03/04/2019 20:38:06|       loc0|          32|
|03/04/2019 20:43:06|       loc0|          28|
|03/04/2019 20:48:06|       loc0|          28|
|03/04/2019 20:53:06|       loc0|          32|
|03/04/2019 20:58:06|       loc0|          34|
|03/04/2019 21:03:06|       loc0|          33|
|03/04/2019 21:08:06|       loc0|          27|
|03/04/2019 21:13:06|       loc0|          28|
|03/04/2019 2

In [33]:
# get to know number of each location id
df1_groupby_loc = df1.groupBy("location_id").count()
df1_groupby_loc.show()

+-----------+-----+
|location_id|count|
+-----------+-----+
|     loc196| 1000|
|     loc226| 1000|
|     loc463| 1000|
|     loc150| 1000|
|     loc292| 1000|
|     loc311| 1000|
|      loc22| 1000|
|     loc351| 1000|
|     loc370| 1000|
|     loc419| 1000|
|      loc31| 1000|
|     loc305| 1000|
|      loc82| 1000|
|      loc90| 1000|
|     loc118| 1000|
|     loc195| 1000|
|     loc208| 1000|
|      loc39| 1000|
|      loc75| 1000|
|     loc228| 1000|
+-----------+-----+
only showing top 20 rows



In [34]:
df1_groupby_loc.orderBy("location_id").show()

+-----------+-----+
|location_id|count|
+-----------+-----+
|       loc0| 1000|
|       loc1| 1000|
|      loc10| 1000|
|     loc100| 1000|
|     loc101| 1000|
|     loc102| 1000|
|     loc103| 1000|
|     loc104| 1000|
|     loc105| 1000|
|     loc106| 1000|
|     loc107| 1000|
|     loc108| 1000|
|     loc109| 1000|
|      loc11| 1000|
|     loc110| 1000|
|     loc111| 1000|
|     loc112| 1000|
|     loc113| 1000|
|     loc114| 1000|
|     loc115| 1000|
+-----------+-----+
only showing top 20 rows



In [35]:
df1_groupby_loc.count()

500

In [36]:
# aggregate: mean
df1.groupBy("location_id").agg({"temp_celcius": "mean"}).show()

+-----------+-----------------+
|location_id|avg(temp_celcius)|
+-----------+-----------------+
|     loc196|           29.225|
|     loc226|           25.306|
|     loc463|           23.317|
|     loc150|           32.188|
|     loc292|           29.159|
|     loc311|           24.308|
|      loc22|           28.251|
|     loc351|           28.194|
|     loc370|            29.14|
|     loc419|           29.141|
|      loc31|           25.196|
|     loc305|           27.314|
|      loc82|           27.355|
|      loc90|           23.216|
|     loc118|           24.219|
|     loc195|            27.25|
|     loc208|           26.206|
|      loc39|           25.199|
|      loc75|           23.209|
|     loc228|           27.295|
+-----------+-----------------+
only showing top 20 rows



In [37]:
# aggregate: max
df1.groupBy("location_id").agg({"temp_celcius": "max"}).show()

+-----------+-----------------+
|location_id|max(temp_celcius)|
+-----------+-----------------+
|     loc196|               36|
|     loc226|               32|
|     loc463|               30|
|     loc150|               39|
|     loc292|               36|
|     loc311|               31|
|      loc22|               35|
|     loc351|               35|
|     loc370|               36|
|     loc419|               36|
|     loc305|               34|
|      loc31|               32|
|     loc118|               31|
|     loc195|               34|
|     loc208|               33|
|      loc82|               34|
|      loc90|               30|
|     loc228|               34|
|      loc39|               32|
|      loc75|               30|
+-----------+-----------------+
only showing top 20 rows



#### Sample data from DataFrame

In [38]:
df1_s1 = df1.sample(withReplacement=False, fraction=0.1) # fraction value is aproximation

In [39]:
# kita lihat rata-rata setiap lokasi untuk data contoh
# catatan data diurutkan berdasarkan lexical ordering
df1_s1.groupBy("location_id") \
      .agg({"temp_celcius": "mean"}) \
      .orderBy("location_id") \
      .show(10)

+-----------+------------------+
|location_id| avg(temp_celcius)|
+-----------+------------------+
|       loc0|           29.1875|
|       loc1| 28.36036036036036|
|      loc10|25.574074074074073|
|     loc100|26.862068965517242|
|     loc101|25.476635514018692|
|     loc102|30.333333333333332|
|     loc103| 25.37719298245614|
|     loc104|25.796116504854368|
|     loc105| 26.10112359550562|
|     loc106|27.460176991150444|
+-----------+------------------+
only showing top 10 rows



In [40]:
# kita lihat dan bandingkang
# rata-rata setiap lokasi untuk data asli
df1.groupBy("location_id") \
   .agg({"temp_celcius": "mean"}) \
   .orderBy("location_id") \
   .show(10)

+-----------+-----------------+
|location_id|avg(temp_celcius)|
+-----------+-----------------+
|       loc0|           29.176|
|       loc1|           28.246|
|      loc10|           25.337|
|     loc100|           27.297|
|     loc101|           25.317|
|     loc102|           30.327|
|     loc103|           25.341|
|     loc104|           26.204|
|     loc105|           26.217|
|     loc106|           27.201|
+-----------+-----------------+
only showing top 10 rows



#### Save data from DataFrames

In [41]:
df1.write.mode("overwrite").format("csv").save("location_temp.csv")

In [42]:
! dir location_temp.csv

 Volume in drive D is DATA
 Volume Serial Number is 5291-7DC1

 Directory of D:\LinkedIn_Learning\Intro_to_SparkSQL_and_Dataframe\location_temp.csv

11/01/2021  18.35    <DIR>          .
11/01/2021  18.35    <DIR>          ..
11/01/2021  18.35            77.072 .part-00000-8156df2c-84e3-4865-bc2d-a621a537e9dd-c000.csv.crc
11/01/2021  18.35            43.184 .part-00001-8156df2c-84e3-4865-bc2d-a621a537e9dd-c000.csv.crc
11/01/2021  18.35                 8 ._SUCCESS.crc
11/01/2021  18.35         9.863.878 part-00000-8156df2c-84e3-4865-bc2d-a621a537e9dd-c000.csv
11/01/2021  18.35         5.526.122 part-00001-8156df2c-84e3-4865-bc2d-a621a537e9dd-c000.csv
11/01/2021  18.35                 0 _SUCCESS
               6 File(s)     15.510.264 bytes
               2 Dir(s)  107.361.640.448 bytes free


bukan file melainkan folder dimana data dari DataFrames tersimpan. Di dalam folder tersebut terdapat file csv, file disimpan menjadi dua bagian / subset part-00000-... dan part-00001-...

#### Querying DataFrame with SQL

In [43]:
# we use "utilization" file, df2 or df3 DataFrames
df3.show(10)

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58|
|           0.53|03/05/2019 08:46:14|       0.35|      100|           62|
|           0.51|03/05/2019 08:51:14|        0.6|      100|           45|
+---------------+-------------------+-

In [44]:
df3.count()

500000

In [45]:
# To perform SQL queries, a temporary view of the table is required
df3.createOrReplaceTempView("utilization")

In [46]:
df_sql1 = spark.sql("SELECT * FROM utilization LIMIT 10")
df_sql1.show()

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58|
|           0.53|03/05/2019 08:46:14|       0.35|      100|           62|
|           0.51|03/05/2019 08:51:14|        0.6|      100|           45|
+---------------+-------------------+-

In [47]:
df_sql1 = spark.sql("SELECT event_datetime, cpu_utilization FROM utilization LIMIT 10")
df_sql1.show()

+-------------------+---------------+
|     event_datetime|cpu_utilization|
+-------------------+---------------+
|03/05/2019 08:06:14|           0.57|
|03/05/2019 08:11:14|           0.47|
|03/05/2019 08:16:14|           0.56|
|03/05/2019 08:21:14|           0.57|
|03/05/2019 08:26:14|           0.35|
|03/05/2019 08:31:14|           0.41|
|03/05/2019 08:36:14|           0.57|
|03/05/2019 08:41:14|           0.41|
|03/05/2019 08:46:14|           0.53|
|03/05/2019 08:51:14|           0.51|
+-------------------+---------------+



In [48]:
df_sql1 = spark.sql("SELECT server_id AS sid, session_count AS sc FROM utilization")
df_sql1.show()

+---+---+
|sid| sc|
+---+---+
|100| 47|
|100| 43|
|100| 62|
|100| 50|
|100| 43|
|100| 48|
|100| 58|
|100| 58|
|100| 62|
|100| 45|
|100| 47|
|100| 60|
|100| 57|
|100| 44|
|100| 47|
|100| 66|
|100| 65|
|100| 66|
|100| 42|
|100| 63|
+---+---+
only showing top 20 rows



#### Filtering DataFrames with SQL

In [49]:
df_sql1 = spark.sql("SELECT * FROM utilization WHERE server_id = 120")
df_sql1.show()

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.66|03/05/2019 08:06:48|       0.31|      120|           54|
|           0.58|03/05/2019 08:11:48|       0.38|      120|           64|
|           0.55|03/05/2019 08:16:48|       0.61|      120|           54|
|            0.7|03/05/2019 08:21:48|       0.35|      120|           80|
|            0.6|03/05/2019 08:26:48|       0.39|      120|           71|
|           0.53|03/05/2019 08:31:48|       0.35|      120|           49|
|           0.73|03/05/2019 08:36:48|       0.42|      120|           73|
|           0.41|03/05/2019 08:41:48|        0.6|      120|           72|
|           0.62|03/05/2019 08:46:48|       0.57|      120|           57|
|           0.67|03/05/2019 08:51:48|       0.44|      120|           78|
|           0.67|03/05/2019 08:56:48| 

In [50]:
df_sql1.count()

10000

In [51]:
df_sql1 = spark.sql("SELECT server_id, session_count FROM utilization WHERE session_count > 70")
df_sql1.show()

+---------+-------------+
|server_id|session_count|
+---------+-------------+
|      100|           71|
|      100|           71|
|      100|           71|
|      100|           71|
|      100|           72|
|      100|           72|
|      100|           71|
|      100|           71|
|      100|           71|
|      100|           72|
|      100|           71|
|      100|           72|
|      100|           71|
|      100|           71|
|      100|           72|
|      100|           71|
|      100|           72|
|      100|           72|
|      100|           71|
|      100|           71|
+---------+-------------+
only showing top 20 rows



In [52]:
df_sql1.count()

239659

In [53]:
df_sql1 = spark.sql("SELECT server_id, session_count FROM utilization WHERE session_count > 70 AND server_id = 120")
df_sql1.show()

+---------+-------------+
|server_id|session_count|
+---------+-------------+
|      120|           80|
|      120|           71|
|      120|           73|
|      120|           72|
|      120|           78|
|      120|           73|
|      120|           78|
|      120|           73|
|      120|           74|
|      120|           78|
|      120|           75|
|      120|           75|
|      120|           73|
|      120|           79|
|      120|           72|
|      120|           77|
|      120|           75|
|      120|           72|
|      120|           79|
|      120|           75|
+---------+-------------+
only showing top 20 rows



In [54]:
df_sql1.count()

2733

In [55]:
df_sql1 = spark.sql("SELECT server_id, session_count \
                     FROM utilization \
                     WHERE server_id = 120 AND session_count > 70 \
                     ORDER BY session_count DESC")

df_sql1.show()

+---------+-------------+
|server_id|session_count|
+---------+-------------+
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
|      120|           80|
+---------+-------------+
only showing top 20 rows



#### Aggregating DataFrames with SQL

In [56]:
df3.show(10)

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58|
|           0.53|03/05/2019 08:46:14|       0.35|      100|           62|
|           0.51|03/05/2019 08:51:14|        0.6|      100|           45|
+---------------+-------------------+-

In [57]:
df_count = spark.sql("SELECT count(*) FROM utilization")
df_count.show()

+--------+
|count(1)|
+--------+
|  500000|
+--------+



In [58]:
df_count1 = spark.sql("SELECT count(*) \
                       FROM utilization \
                       WHERE session_count > 70")

df_count1.show()

+--------+
|count(1)|
+--------+
|  239659|
+--------+



In [59]:
df_group = spark.sql("SELECT server_id, count(*) \
                      FROM utilization \
                      WHERE session_count > 70 \
                      GROUP BY server_id \
                      ORDER BY count(*)")

df_group.show()

+---------+--------+
|server_id|count(1)|
+---------+--------+
|      143|     144|
|      100|     391|
|      105|    1110|
|      116|    1167|
|      135|    1654|
|      147|    1783|
|      132|    2048|
|      114|    2128|
|      134|    2147|
|      120|    2733|
|      110|    2826|
|      125|    2843|
|      130|    2891|
|      111|    3093|
|      141|    3097|
|      109|    3129|
|      129|    3222|
|      117|    3605|
|      128|    3719|
|      136|    4316|
+---------+--------+
only showing top 20 rows



In [60]:
df_group1 = spark.sql("SELECT server_id, round(avg(session_count), 2), max(session_count), count(*) \
                       FROM utilization \
                       WHERE session_count > 70 \
                       GROUP BY server_id \
                       ORDER BY count(*) DESC")

df_group1.show()

+---------+----------------------------+------------------+--------+
|server_id|round(avg(session_count), 2)|max(session_count)|count(1)|
+---------+----------------------------+------------------+--------+
|      101|                       87.67|               105|    9808|
|      113|                       86.96|               103|    9418|
|      145|                       86.98|               103|    9304|
|      103|                       85.76|               101|    8744|
|      102|                       85.71|               101|    8586|
|      133|                       85.47|               100|    8583|
|      108|                       85.12|               100|    8375|
|      149|                       84.96|                99|    8288|
|      137|                       85.01|                99|    8248|
|      148|                        84.7|                99|    8027|
|      123|                       84.53|                98|    7918|
|      118|                       

#### Joining DataFrames with SQL

In [61]:
df3.show()

+---------------+-------------------+-----------+---------+-------------+
|cpu_utilization|     event_datetime|free_memory|server_id|session_count|
+---------------+-------------------+-----------+---------+-------------+
|           0.57|03/05/2019 08:06:14|       0.51|      100|           47|
|           0.47|03/05/2019 08:11:14|       0.62|      100|           43|
|           0.56|03/05/2019 08:16:14|       0.57|      100|           62|
|           0.57|03/05/2019 08:21:14|       0.56|      100|           50|
|           0.35|03/05/2019 08:26:14|       0.46|      100|           43|
|           0.41|03/05/2019 08:31:14|       0.58|      100|           48|
|           0.57|03/05/2019 08:36:14|       0.35|      100|           58|
|           0.41|03/05/2019 08:41:14|        0.4|      100|           58|
|           0.53|03/05/2019 08:46:14|       0.35|      100|           62|
|           0.51|03/05/2019 08:51:14|        0.6|      100|           45|
|           0.32|03/05/2019 08:56:14| 

In [62]:
df_server_name = spark.read.format("csv").option("header", "true").load(DATA_PATH + "/server_name.csv")

In [63]:
df_server_name.show()

+---------+-----------+
|server_id|server_name|
+---------+-----------+
|      100| 100 Server|
|      101| 101 Server|
|      102| 102 Server|
|      103| 103 Server|
|      104| 104 Server|
|      105| 105 Server|
|      106| 106 Server|
|      107| 107 Server|
|      108| 108 Server|
|      109| 109 Server|
|      110| 110 Server|
|      111| 111 Server|
|      112| 112 Server|
|      113| 113 Server|
|      114| 114 Server|
|      115| 115 Server|
|      116| 116 Server|
|      117| 117 Server|
|      118| 118 Server|
|      119| 119 Server|
+---------+-----------+
only showing top 20 rows



In [64]:
# check the available temporary tables
spark.catalog.listTables()

[Table(name='utilization', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [65]:
df_server_name.createOrReplaceTempView("server_name")

In [66]:
spark.catalog.listTables()

[Table(name='server_name', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='utilization', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [67]:
df_count = spark.sql("SELECT distinct(server_id) FROM utilization")
df_count.show()

+---------+
|server_id|
+---------+
|      112|
|      113|
|      130|
|      126|
|      149|
|      110|
|      136|
|      144|
|      119|
|      116|
|      145|
|      124|
|      143|
|      107|
|      146|
|      103|
|      139|
|      138|
|      114|
|      115|
+---------+
only showing top 20 rows



In [68]:
spark.sql("SELECT min(server_id), max(server_id) FROM utilization").show()

+--------------+--------------+
|min(server_id)|max(server_id)|
+--------------+--------------+
|           100|           149|
+--------------+--------------+



In [69]:
df_join = spark.sql("SELECT u.server_id, sn.server_name, u.session_count \
                     FROM utilization u \
                     INNER JOIN server_name sn \
                     ON u.server_id = sn.server_id")
df_join.show()

+---------+-----------+-------------+
|server_id|server_name|session_count|
+---------+-----------+-------------+
|      100| 100 Server|           47|
|      100| 100 Server|           43|
|      100| 100 Server|           62|
|      100| 100 Server|           50|
|      100| 100 Server|           43|
|      100| 100 Server|           48|
|      100| 100 Server|           58|
|      100| 100 Server|           58|
|      100| 100 Server|           62|
|      100| 100 Server|           45|
|      100| 100 Server|           47|
|      100| 100 Server|           60|
|      100| 100 Server|           57|
|      100| 100 Server|           44|
|      100| 100 Server|           47|
|      100| 100 Server|           66|
|      100| 100 Server|           65|
|      100| 100 Server|           66|
|      100| 100 Server|           42|
|      100| 100 Server|           63|
+---------+-----------+-------------+
only showing top 20 rows



#### Eliminating duplicates in DataFrames

In [70]:
from pyspark.sql import Row

In [71]:
sc = spark.sparkContext

In [72]:
# create DataFrame from RDD
rdd = sc.parallelize([Row(server_name="101 Server", cpu_utilization=85, session_count=80),
                         Row(server_name="101 Server", cpu_utilization=80, session_count=90),
                         Row(server_name="102 Server", cpu_utilization=85, session_count=80),
                         Row(server_name="102 Server", cpu_utilization=85, session_count=80)])

In [73]:
rdd.collect()

[Row(cpu_utilization=85, server_name='101 Server', session_count=80),
 Row(cpu_utilization=80, server_name='101 Server', session_count=90),
 Row(cpu_utilization=85, server_name='102 Server', session_count=80),
 Row(cpu_utilization=85, server_name='102 Server', session_count=80)]

In [74]:
# df_dup = rdd.toDF()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 98.0 failed 1 times, most recent failure: Lost task 0.0 in stage 98.0 (TID 1427, localhost, executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:109)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.net.PlainSocketImpl.accept(Unknown Source)
	at java.net.ServerSocket.implAccept(Unknown Source)
	at java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:164)
	... 14 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:153)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:109)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.net.PlainSocketImpl.accept(Unknown Source)
	at java.net.ServerSocket.implAccept(Unknown Source)
	at java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:164)
	... 14 more


Saya tidak dapat mengkonversi rdd ke DataFrames dengan metode .toDF, mendapatkan pesan eror.

In [82]:
data = [["101 Server", 85, 80],
        ["101 Server", 80, 90],
        ["102 Server", 85, 80],
        ["102 Server", 85, 80]]
# schema = " server_name STRING, cpu_utilization INT, session_count INT"

In [87]:
df_dup = spark.createDataFrame(data)

In [80]:
df_dup

DataFrame[server_name: string, cpu_utilization: int, session_count: int]

In [88]:
df_dup.show()

Py4JJavaError: An error occurred while calling o460.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 105.0 failed 1 times, most recent failure: Lost task 0.0 in stage 105.0 (TID 1434, localhost, executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:109)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.net.PlainSocketImpl.accept(Unknown Source)
	at java.net.ServerSocket.implAccept(Unknown Source)
	at java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:164)
	... 35 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.GeneratedMethodAccessor62.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:109)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.net.PlainSocketImpl.accept(Unknown Source)
	at java.net.ServerSocket.implAccept(Unknown Source)
	at java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:164)
	... 35 more


Saya berhasil membuat dataframe dengan spark.createDataFrame(), namun menagalami eror ketika ingin menampilkan dengan .show()

Dengan memuat data dari file csv, semuanya berjalan dengan baik.

In [90]:
df_dup = spark.read.format("csv").option("header", "true").load("duplicate.csv")

In [91]:
df_dup.show()

+-----------+----------------+--------------+
|server_name| cpu_utilization| session_count|
+-----------+----------------+--------------+
| 101 Server|              85|            80|
| 101 Server|              80|            90|
| 102 Server|              85|            80|
| 102 Server|              85|            80|
+-----------+----------------+--------------+



In [92]:
df_dup.drop_duplicates().show()

+-----------+----------------+--------------+
|server_name| cpu_utilization| session_count|
+-----------+----------------+--------------+
| 101 Server|              85|            80|
| 101 Server|              80|            90|
| 102 Server|              85|            80|
+-----------+----------------+--------------+



In [94]:
df_dup.drop_duplicates(["server_name"]).show()

+-----------+----------------+--------------+
|server_name| cpu_utilization| session_count|
+-----------+----------------+--------------+
| 102 Server|              85|            80|
| 101 Server|              85|            80|
+-----------+----------------+--------------+



#### Working with NA Values

In [95]:
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType

In [96]:
df_na = df_dup.withColumn("na_col", lit(None).cast(StringType()))

In [97]:
df_na.show()

+-----------+----------------+--------------+------+
|server_name| cpu_utilization| session_count|na_col|
+-----------+----------------+--------------+------+
| 101 Server|              85|            80|  null|
| 101 Server|              80|            90|  null|
| 102 Server|              85|            80|  null|
| 102 Server|              85|            80|  null|
+-----------+----------------+--------------+------+



In [98]:
df_na.fillna('A').show()

+-----------+----------------+--------------+------+
|server_name| cpu_utilization| session_count|na_col|
+-----------+----------------+--------------+------+
| 101 Server|              85|            80|     A|
| 101 Server|              80|            90|     A|
| 102 Server|              85|            80|     A|
| 102 Server|              85|            80|     A|
+-----------+----------------+--------------+------+



In [100]:
df = df_na.fillna('A').union(df_na)

In [101]:
df.show()

+-----------+----------------+--------------+------+
|server_name| cpu_utilization| session_count|na_col|
+-----------+----------------+--------------+------+
| 101 Server|              85|            80|     A|
| 101 Server|              80|            90|     A|
| 102 Server|              85|            80|     A|
| 102 Server|              85|            80|     A|
| 101 Server|              85|            80|  null|
| 101 Server|              80|            90|  null|
| 102 Server|              85|            80|  null|
| 102 Server|              85|            80|  null|
+-----------+----------------+--------------+------+



In [102]:
df.na.drop().show()

+-----------+----------------+--------------+------+
|server_name| cpu_utilization| session_count|na_col|
+-----------+----------------+--------------+------+
| 101 Server|              85|            80|     A|
| 101 Server|              80|            90|     A|
| 102 Server|              85|            80|     A|
| 102 Server|              85|            80|     A|
+-----------+----------------+--------------+------+



In [103]:
df.createOrReplaceTempView("na_table")

In [105]:
spark.sql("SELECT * FROM na_table WHERE na_col IS NULL").show()

+-----------+----------------+--------------+------+
|server_name| cpu_utilization| session_count|na_col|
+-----------+----------------+--------------+------+
| 101 Server|              85|            80|  null|
| 101 Server|              80|            90|  null|
| 102 Server|              85|            80|  null|
| 102 Server|              85|            80|  null|
+-----------+----------------+--------------+------+



In [106]:
spark.sql("SELECT * FROM na_table WHERE na_col IS NOT NULL").show()

+-----------+----------------+--------------+------+
|server_name| cpu_utilization| session_count|na_col|
+-----------+----------------+--------------+------+
| 101 Server|              85|            80|     A|
| 101 Server|              80|            90|     A|
| 102 Server|              85|            80|     A|
| 102 Server|              85|            80|     A|
+-----------+----------------+--------------+------+

