In [2]:
# import libraries
import pyspark
import pandas as pd


from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import upper, col

In [3]:
# Initialize PySpark: obtain the SparkSession used by every later cell,
# registered under the application name 'Pyspark Tutorial'.
spark = (
    SparkSession.builder
    .appName('Pyspark Tutorial')
    .getOrCreate()
)

In [4]:
# Bare expression: the notebook renders the SparkSession object as this cell's output.
spark

In [5]:
# --- Sample orders -----------------------------------------------------------
# Each tuple is one order; its fields follow the order of `columns_orders`.
columns_orders = ["order_id", "customer_id", "product", "amount", "order_date"]
data_orders = [
    (1001, "C001", "Laptop", 12000, "2024-03-01"),
    (1002, "C002", "Smartphone", 8000, "2024-03-02"),
    (1003, "C003", "Monitor", 3000, "2024-03-03"),
]
df_orders = spark.createDataFrame(data_orders, schema=columns_orders)

# --- Sample customers --------------------------------------------------------
# Each tuple is one customer; `customer_id` matches the key used in the orders.
columns_customers = ["customer_id", "name", "city"]
data_customers = [
    ("C001", "Andi", "Jakarta"),
    ("C002", "Budi", "Bandung"),
    ("C003", "Citra", "Surabaya"),
]
df_customers = spark.createDataFrame(data_customers, schema=columns_customers)

In [6]:
# Print the orders DataFrame as a text table (one row per order).
df_orders.show()

+--------+-----------+----------+------+----------+
|order_id|customer_id|   product|amount|order_date|
+--------+-----------+----------+------+----------+
|    1001|       C001|    Laptop| 12000|2024-03-01|
|    1002|       C002|Smartphone|  8000|2024-03-02|
|    1003|       C003|   Monitor|  3000|2024-03-03|
+--------+-----------+----------+------+----------+



In [7]:
# Print the customers DataFrame as a text table (one row per customer).
df_customers.show()

+-----------+-----+--------+
|customer_id| name|    city|
+-----------+-----+--------+
|       C001| Andi| Jakarta|
|       C002| Budi| Bandung|
|       C003|Citra|Surabaya|
+-----------+-----+--------+



In [8]:
# Print the customers vertically: one "RECORD" block per row, one line per column.
df_customers.show(vertical=True)

-RECORD 0---------------
 customer_id | C001     
 name        | Andi     
 city        | Jakarta  
-RECORD 1---------------
 customer_id | C002     
 name        | Budi     
 city        | Bandung  
-RECORD 2---------------
 customer_id | C003     
 name        | Citra    
 city        | Surabaya 



In [9]:
# Same vertical layout, but limited to the first record only.
df_customers.show(n=1, vertical=True)

-RECORD 0--------------
 customer_id | C001    
 name        | Andi    
 city        | Jakarta 
only showing top 1 row



In [10]:
# Print the schema of the orders DataFrame as a tree: column name, type and nullability.
df_orders.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: long (nullable = true)
 |-- order_date: string (nullable = true)



In [11]:
# Referencing a column yields a Column expression object, not data;
# nothing is computed until it is used in a DataFrame operation.
df_customers["name"]

Column<'name'>

In [12]:
# Project the customers down to the `name` column and print the result.
customer_names = df_customers.select("name")
customer_names.show()

+-----+
| name|
+-----+
| Andi|
| Budi|
|Citra|
+-----+



In [13]:
# Keep only the customers whose name is exactly 'Andi', then print them.
df_customers.where(col("name") == "Andi").show()

+-----------+----+-------+
|customer_id|name|   city|
+-----------+----+-------+
|       C001|Andi|Jakarta|
+-----------+----+-------+



In [14]:
# Inner-join orders with customers on the shared `customer_id` key so each
# order row also carries the customer's name and city.
df_all = df_orders.join(
    other=df_customers,
    on="customer_id",
    how="inner",
)
df_all.show()

+-----------+--------+----------+------+----------+-----+--------+
|customer_id|order_id|   product|amount|order_date| name|    city|
+-----------+--------+----------+------+----------+-----+--------+
|       C001|    1001|    Laptop| 12000|2024-03-01| Andi| Jakarta|
|       C002|    1002|Smartphone|  8000|2024-03-02| Budi| Bandung|
|       C003|    1003|   Monitor|  3000|2024-03-03|Citra|Surabaya|
+-----------+--------+----------+------+----------+-----+--------+



In [15]:
# Load the JSON dataset from disk into a Spark DataFrame.
dataset = '/home/jovyan/work/data/nyt2.json'
data = spark.read.format("json").load(dataset)

In [16]:
# Preview the first three rows of the loaded dataset.
data.show(n=3)

+--------------------+--------------------+---------------+-----------------+--------------------+-------------+-----------------+-------------+----+--------------+--------------------+-------------+
|                 _id|  amazon_product_url|         author| bestsellers_date|         description|        price|   published_date|    publisher|rank|rank_last_week|               title|weeks_on_list|
+--------------------+--------------------+---------------+-----------------+--------------------+-------------+-----------------+-------------+----+--------------+--------------------+-------------+
|{5b4aa4ead3089013...|http://www.amazon...|  Dean R Koontz|{{1211587200000}}|Odd Thomas, who c...|   {NULL, 27}|{{1212883200000}}|       Bantam| {1}|           {0}|           ODD HOURS|          {1}|
|{5b4aa4ead3089013...|http://www.amazon...|Stephenie Meyer|{{1211587200000}}|Aliens have taken...|{25.99, NULL}|{{1212883200000}}|Little, Brown| {2}|           {1}|            THE HOST|          {3}|


In [17]:
# List the column names of the DataFrame loaded from the JSON file.
data.columns

['_id',
 'amazon_product_url',
 'author',
 'bestsellers_date',
 'description',
 'price',
 'published_date',
 'publisher',
 'rank',
 'rank_last_week',
 'title',
 'weeks_on_list']

In [18]:
# Print the inferred schema as a tree. Note that several fields (e.g. price,
# rank, the date columns) are nested structs whose leaves are strings.
data.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- $oid: string (nullable = true)
 |-- amazon_product_url: string (nullable = true)
 |-- author: string (nullable = true)
 |-- bestsellers_date: struct (nullable = true)
 |    |-- $date: struct (nullable = true)
 |    |    |-- $numberLong: string (nullable = true)
 |-- description: string (nullable = true)
 |-- price: struct (nullable = true)
 |    |-- $numberDouble: string (nullable = true)
 |    |-- $numberInt: string (nullable = true)
 |-- published_date: struct (nullable = true)
 |    |-- $date: struct (nullable = true)
 |    |    |-- $numberLong: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- rank: struct (nullable = true)
 |    |-- $numberInt: string (nullable = true)
 |-- rank_last_week: struct (nullable = true)
 |    |-- $numberInt: string (nullable = true)
 |-- title: string (nullable = true)
 |-- weeks_on_list: struct (nullable = true)
 |    |-- $numberInt: string (nullable = true)



In [19]:
# Print the first five book titles.
data.select(col("title")).show(n=5)

+--------------------+
|               title|
+--------------------+
|           ODD HOURS|
|            THE HOST|
|LOVE THE ONE YOU'...|
|           THE FRONT|
|               SNUFF|
+--------------------+
only showing top 5 rows



In [20]:
# Project a handful of columns and print the first ten rows.
preview_columns = ["author", "title", "rank", "price"]
data.select(preview_columns).show(n=10)

+--------------------+--------------------+----+-------------+
|              author|               title|rank|        price|
+--------------------+--------------------+----+-------------+
|       Dean R Koontz|           ODD HOURS| {1}|   {NULL, 27}|
|     Stephenie Meyer|            THE HOST| {2}|{25.99, NULL}|
|        Emily Giffin|LOVE THE ONE YOU'...| {3}|{24.95, NULL}|
|   Patricia Cornwell|           THE FRONT| {4}|{22.95, NULL}|
|     Chuck Palahniuk|               SNUFF| {5}|{24.95, NULL}|
|James Patterson a...|SUNDAYS AT TIFFANY’S| {6}|{24.99, NULL}|
|       John Sandford|        PHANTOM PREY| {7}|{26.95, NULL}|
|       Jimmy Buffett|          SWINE NOT?| {8}|{21.99, NULL}|
|    Elizabeth George|     CARELESS IN RED| {9}|{27.95, NULL}|
|      David Baldacci|     THE WHOLE TRUTH|{10}|{26.99, NULL}|
+--------------------+--------------------+----+-------------+
only showing top 10 rows



In [21]:
# Add a derived column `author_upper` holding the upper-cased author name.
data = data.withColumn("author_upper", upper(col("author")))

In [22]:
# Preview the first three rows again; the new `author_upper` column is last.
data.show(n=3)

+--------------------+--------------------+---------------+-----------------+--------------------+-------------+-----------------+-------------+----+--------------+--------------------+-------------+---------------+
|                 _id|  amazon_product_url|         author| bestsellers_date|         description|        price|   published_date|    publisher|rank|rank_last_week|               title|weeks_on_list|   author_upper|
+--------------------+--------------------+---------------+-----------------+--------------------+-------------+-----------------+-------------+----+--------------+--------------------+-------------+---------------+
|{5b4aa4ead3089013...|http://www.amazon...|  Dean R Koontz|{{1211587200000}}|Odd Thomas, who c...|   {NULL, 27}|{{1212883200000}}|       Bantam| {1}|           {0}|           ODD HOURS|          {1}|  DEAN R KOONTZ|
|{5b4aa4ead3089013...|http://www.amazon...|Stephenie Meyer|{{1211587200000}}|Aliens have taken...|{25.99, NULL}|{{1212883200000}}|Little

In [23]:
# List the column names again; 'author_upper' now appears at the end.
data.columns

['_id',
 'amazon_product_url',
 'author',
 'bestsellers_date',
 'description',
 'price',
 'published_date',
 'publisher',
 'rank',
 'rank_last_week',
 'title',
 'weeks_on_list',
 'author_upper']

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 36396)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/usr/local/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates =