## Section 19 Getting started with Semi Structured Data

In [1]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

In [2]:
username

'itv011204'

In [5]:
spark = SparkSession. \
    builder. \
    config('spark.ui.port','0'). \
    config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
    enableHiveSupport(). \
    appName(f'{username} | Section 19 Semi Structured Data'). \
    master('yarn'). \
    getOrCreate()
    

In [6]:
spark.conf.set('spark.sql.shuffle.partitions','2')

In [7]:
spark.sparkContext.getConf().getAll()

[('spark.eventLog.enabled', 'true'),
 ('spark.sql.repl.eagerEval.enabled', 'true'),
 ('spark.app.name', 'itv011204 | Section 19 Semi Structured Data'),
 ('spark.eventLog.dir', 'hdfs:///spark-logs'),
 ('spark.dynamicAllocation.maxExecutors', '10'),
 ('spark.yarn.historyServer.address', 'm02.itversity.com:18080'),
 ('spark.yarn.jars', ''),
 ('spark.history.provider',
  'org.apache.spark.deploy.history.FsHistoryProvider'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.history.fs.logDirectory', 'hdfs:///spark-logs'),
 ('spark.submit.deployMode', 'client'),
 ('spark.history.fs.update.interval', '10s'),
 ('spark.driver.extraJavaOptions', '-Dderby.system.home=/tmp/derby/'),
 ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',
  'm02.itversity.com'),
 ('spark.ui.filters',
  'org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter'),
 ('spark.driver.appUIAddress', 'http://g02.itversity.com:44379'),
 ('spark.executor.extraLibraryPath', '/opt/hadoo

In [8]:
spark.sparkContext.getConf().get('spark.sql.shuffle.partitions')

In [9]:
spark.sql(f'CREATE DATABASE IF NOT EXISTS {username}_demo')

In [10]:
spark.sql(f'USE {username}_demo')

In [11]:
spark.sql('DROP TABLE IF EXISTS employees')

In [12]:
# Array python list

type(['+1 123 456 7890','+91 1234 567890'])

list

In [13]:
# Array json list

type(["+1 123 456 7890","+91 1234 567890"])

list

In [14]:
{"home":"+1 123 456 7890","business":"+91 1234 567890" }

{'home': '+1 123 456 7890', 'business': '+91 1234 567890'}

In [15]:
{"Street Address": "Street 1":, "City":"Round Rock", "State":"TX", "Zip Code", 12345 }

SyntaxError: invalid syntax (<ipython-input-15-205169211310>, line 1)

In [16]:
spark.sql("""
    CREATE TABLE employees(
        employee_id INT,
        employee_first_name STRING,
        employee_last_name STRING,
        employee_salary FLOAT,
        employee_nationality STRING,
        employee_email_ids ARRAY<STRING>,
        employee_phone_numbers MAP<STRING, STRING>,
        employee_ssn STRING,
        employee_address STRUCT<street: STRING, city: STRING, state: STRING, postal_code: INT>
    )
""")

In [17]:
spark.sql("SHOW TABLES")

database,tableName,isTemporary
itv011204_demo,employees,False


In [18]:
spark.sql("DESCRIBE employees").show(truncate=False)

+----------------------+--------------------------------------------------------------+-------+
|col_name              |data_type                                                     |comment|
+----------------------+--------------------------------------------------------------+-------+
|employee_id           |int                                                           |null   |
|employee_first_name   |string                                                        |null   |
|employee_last_name    |string                                                        |null   |
|employee_salary       |float                                                         |null   |
|employee_nationality  |string                                                        |null   |
|employee_email_ids    |array<string>                                                 |null   |
|employee_phone_numbers|map<string,string>                                            |null   |
|employee_ssn          |string          

In [19]:
MAP<STRING , ARRAY<STRING>>

SyntaxError: invalid syntax (<ipython-input-19-f670835ac232>, line 1)

In [20]:
spark.sql("""
    INSERT INTO employees
    VALUES(1, 'Scott', 'Tiger', 1000.0, 'United States',
        ARRAY('scott@tiger.com','stiger@companyx.com'),
        MAP('Home','+1 234 567 8901', 'Office','+1 345 678 9012'), '789 12 6118',
        STRUCT('1234 ABC St', 'My City', 'My State', 13455)
    )
""")

In [21]:
spark.sql("SELECT employee_id, employee_email_ids, employee_phone_numbers, employee_address FROM employees"). \
    show(truncate=False)

+-----------+--------------------------------------+----------------------------------------------------+---------------------------------------+
|employee_id|employee_email_ids                    |employee_phone_numbers                              |employee_address                       |
+-----------+--------------------------------------+----------------------------------------------------+---------------------------------------+
|1          |[scott@tiger.com, stiger@companyx.com]|{Home -> +1 234 567 8901, Office -> +1 345 678 9012}|{1234 ABC St, My City, My State, 13455}|
+-----------+--------------------------------------+----------------------------------------------------+---------------------------------------+



In [22]:
employees = [
    (2, "Henry", "Ford", 1250.0, "India",
     ['henry@ford.com', 'hford@companyx.com'],
     {"Home": "+91 234 567 8901", "Office": "+91 345 678 9012"},
     "456 78 9123",('111 BCD Cir', 'Some City', 'Some State', 500091)
    ),
    (3, "Nick", "Junior", 750.0, "United Kingdom",
     ['nick@junior.com', 'njunior@companyx.com'],
     {"Home": "+44 111 111 1111", "Office": "+44 222 222 2222"},
     "222 33 4444",('222 Giant Cly', 'UK City', 'UK Province', None)
    ),
    (4, "Bill", "Gomes", 1500.0, "Australia",
     ['bill@gomes.com','bgomes@companyx.com'],
     {"Home":"+61 987 654 3210", "Office": "+61 876 543 2109"},
     "789 12 6118", None
    ),
    (5, "Saravanan", "Krishnarajan", 1600.0, "Sweden",
     ['saravanan@krishnarajan.com','skrishnarajan@companyx.com'],
     {"Home":"+46 987 654 3210", "Office": "+46 876 543 2109","Mobile":"+46 222 222222"},
     "789 12 6118", None
    )
]

In [23]:
type(employees)

list

In [24]:
employees[0]

(2,
 'Henry',
 'Ford',
 1250.0,
 'India',
 ['henry@ford.com', 'hford@companyx.com'],
 {'Home': '+91 234 567 8901', 'Office': '+91 345 678 9012'},
 '456 78 9123',
 ('111 BCD Cir', 'Some City', 'Some State', 500091))

In [25]:
employees[0][0]

2

In [26]:
employees[0][1]

'Henry'

In [27]:
employees[:][1]

(3,
 'Nick',
 'Junior',
 750.0,
 'United Kingdom',
 ['nick@junior.com', 'njunior@companyx.com'],
 {'Home': '+44 111 111 1111', 'Office': '+44 222 222 2222'},
 '222 33 4444',
 ('222 Giant Cly', 'UK City', 'UK Province', None))

In [28]:
employees_df = spark.createDataFrame(
    employees,
    schema = """
        employee_id INT, employee_first_name STRING, employee_last_name STRING, 
        employee_salary FLOAT, employee_nationality STRING,
        employee_email_ids ARRAY<STRING>,
        employee_phone_numbers MAP<STRING, STRING>,
        employee_ssn STRING,
        employee_address STRUCT<street: STRING, city: STRING, state: STRING, postal_code: INT>
    """
)

In [29]:
employees_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- employee_first_name: string (nullable = true)
 |-- employee_last_name: string (nullable = true)
 |-- employee_salary: float (nullable = true)
 |-- employee_nationality: string (nullable = true)
 |-- employee_email_ids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- employee_phone_numbers: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- employee_ssn: string (nullable = true)
 |-- employee_address: struct (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- postal_code: integer (nullable = true)



In [30]:
employees_df.select('employee_email_ids').show(truncate=False)

+--------------------------------------------------------+
|employee_email_ids                                      |
+--------------------------------------------------------+
|[henry@ford.com, hford@companyx.com]                    |
|[nick@junior.com, njunior@companyx.com]                 |
|[bill@gomes.com, bgomes@companyx.com]                   |
|[saravanan@krishnarajan.com, skrishnarajan@companyx.com]|
+--------------------------------------------------------+



In [31]:
employees_df.select('employee_phone_numbers').show(truncate=False)

+--------------------------------------------------------------------------------+
|employee_phone_numbers                                                          |
+--------------------------------------------------------------------------------+
|{Office -> +91 345 678 9012, Home -> +91 234 567 8901}                          |
|{Office -> +44 222 222 2222, Home -> +44 111 111 1111}                          |
|{Office -> +61 876 543 2109, Home -> +61 987 654 3210}                          |
|{Office -> +46 876 543 2109, Mobile -> +46 222 222222, Home -> +46 987 654 3210}|
+--------------------------------------------------------------------------------+



In [32]:
employees_df.select('employee_address').show(truncate=False)

+--------------------------------------------+
|employee_address                            |
+--------------------------------------------+
|{111 BCD Cir, Some City, Some State, 500091}|
|{222 Giant Cly, UK City, UK Province, null} |
|null                                        |
|null                                        |
+--------------------------------------------+



In [33]:
spark.sql("DESCRIBE employees").show(truncate=False)

+----------------------+--------------------------------------------------------------+-------+
|col_name              |data_type                                                     |comment|
+----------------------+--------------------------------------------------------------+-------+
|employee_id           |int                                                           |null   |
|employee_first_name   |string                                                        |null   |
|employee_last_name    |string                                                        |null   |
|employee_salary       |float                                                         |null   |
|employee_nationality  |string                                                        |null   |
|employee_email_ids    |array<string>                                                 |null   |
|employee_phone_numbers|map<string,string>                                            |null   |
|employee_ssn          |string          

In [34]:
employees_df.write.insertInto('employees')

In [35]:
spark.read.table('employees').show(truncate=False)

+-----------+-------------------+------------------+---------------+--------------------+--------------------------------------------------------+--------------------------------------------------------------------------------+------------+--------------------------------------------+
|employee_id|employee_first_name|employee_last_name|employee_salary|employee_nationality|employee_email_ids                                      |employee_phone_numbers                                                          |employee_ssn|employee_address                            |
+-----------+-------------------+------------------+---------------+--------------------+--------------------------------------------------------+--------------------------------------------------------------------------------+------------+--------------------------------------------+
|1          |Scott              |Tiger             |1000.0         |United States       |[scott@tiger.com, stiger@companyx.com]               

In [36]:
spark.read.table('employees').select('employee_email_ids').show(truncate=False)

+--------------------------------------------------------+
|employee_email_ids                                      |
+--------------------------------------------------------+
|[scott@tiger.com, stiger@companyx.com]                  |
|[henry@ford.com, hford@companyx.com]                    |
|[nick@junior.com, njunior@companyx.com]                 |
|[bill@gomes.com, bgomes@companyx.com]                   |
|[saravanan@krishnarajan.com, skrishnarajan@companyx.com]|
+--------------------------------------------------------+



In [37]:
spark.read.table('employees').select('employee_phone_numbers').show(truncate=False)

+--------------------------------------------------------------------------------+
|employee_phone_numbers                                                          |
+--------------------------------------------------------------------------------+
|{Home -> +1 234 567 8901, Office -> +1 345 678 9012}                            |
|{Home -> +91 234 567 8901, Office -> +91 345 678 9012}                          |
|{Home -> +44 111 111 1111, Office -> +44 222 222 2222}                          |
|{Home -> +61 987 654 3210, Office -> +61 876 543 2109}                          |
|{Home -> +46 987 654 3210, Mobile -> +46 222 222222, Office -> +46 876 543 2109}|
+--------------------------------------------------------------------------------+



In [38]:
spark.sql("SELECT * FROM employees").show()

+-----------+-------------------+------------------+---------------+--------------------+--------------------+----------------------+------------+--------------------+
|employee_id|employee_first_name|employee_last_name|employee_salary|employee_nationality|  employee_email_ids|employee_phone_numbers|employee_ssn|    employee_address|
+-----------+-------------------+------------------+---------------+--------------------+--------------------+----------------------+------------+--------------------+
|          1|              Scott|             Tiger|         1000.0|       United States|[scott@tiger.com,...|  {Home -> +1 234 5...| 789 12 6118|{1234 ABC St, My ...|
|          2|              Henry|              Ford|         1250.0|               India|[henry@ford.com, ...|  {Home -> +91 234 ...| 456 78 9123|{111 BCD Cir, Som...|
|          3|               Nick|            Junior|          750.0|      United Kingdom|[nick@junior.com,...|  {Home -> +44 111 ...| 222 33 4444|{222 Giant Cly

In [39]:
spark.sql("""
            SELECT employee_id, employee_email_ids, employee_phone_numbers, employee_address FROM employees
            WHERE employee_id<>4
          """).show(truncate=False)

+-----------+--------------------------------------------------------+--------------------------------------------------------------------------------+--------------------------------------------+
|employee_id|employee_email_ids                                      |employee_phone_numbers                                                          |employee_address                            |
+-----------+--------------------------------------------------------+--------------------------------------------------------------------------------+--------------------------------------------+
|1          |[scott@tiger.com, stiger@companyx.com]                  |{Home -> +1 234 567 8901, Office -> +1 345 678 9012}                            |{1234 ABC St, My City, My State, 13455}     |
|2          |[henry@ford.com, hford@companyx.com]                    |{Home -> +91 234 567 8901, Office -> +91 345 678 9012}                          |{111 BCD Cir, Some City, Some State, 500091}|
|3          |[n

In [40]:
spark.sql("""
            SELECT employee_id, employee_email_ids, employee_address FROM employees
          """).show(truncate=False)

+-----------+--------------------------------------------------------+--------------------------------------------+
|employee_id|employee_email_ids                                      |employee_address                            |
+-----------+--------------------------------------------------------+--------------------------------------------+
|1          |[scott@tiger.com, stiger@companyx.com]                  |{1234 ABC St, My City, My State, 13455}     |
|2          |[henry@ford.com, hford@companyx.com]                    |{111 BCD Cir, Some City, Some State, 500091}|
|3          |[nick@junior.com, njunior@companyx.com]                 |{222 Giant Cly, UK City, UK Province, null} |
|4          |[bill@gomes.com, bgomes@companyx.com]                   |null                                        |
|5          |[saravanan@krishnarajan.com, skrishnarajan@companyx.com]|null                                        |
+-----------+--------------------------------------------------------+--

In [41]:
for i in employees_df:
    print(i)

Column<'employee_id'>
Column<'employee_first_name'>
Column<'employee_last_name'>
Column<'employee_salary'>
Column<'employee_nationality'>
Column<'employee_email_ids'>
Column<'employee_phone_numbers'>
Column<'employee_ssn'>
Column<'employee_address'>


In [42]:
employees_df.toJSON().first()

'{"employee_id":2,"employee_first_name":"Henry","employee_last_name":"Ford","employee_salary":1250.0,"employee_nationality":"India","employee_email_ids":["henry@ford.com","hford@companyx.com"],"employee_phone_numbers":{"Office":"+91 345 678 9012","Home":"+91 234 567 8901"},"employee_ssn":"456 78 9123","employee_address":{"street":"111 BCD Cir","city":"Some City","state":"Some State","postal_code":500091}}'

In [43]:
employees_df.toJSON().first()

'{"employee_id":2,"employee_first_name":"Henry","employee_last_name":"Ford","employee_salary":1250.0,"employee_nationality":"India","employee_email_ids":["henry@ford.com","hford@companyx.com"],"employee_phone_numbers":{"Office":"+91 345 678 9012","Home":"+91 234 567 8901"},"employee_ssn":"456 78 9123","employee_address":{"street":"111 BCD Cir","city":"Some City","state":"Some State","postal_code":500091}}'

In [44]:
employees_df.toJSON().collect()

['{"employee_id":2,"employee_first_name":"Henry","employee_last_name":"Ford","employee_salary":1250.0,"employee_nationality":"India","employee_email_ids":["henry@ford.com","hford@companyx.com"],"employee_phone_numbers":{"Office":"+91 345 678 9012","Home":"+91 234 567 8901"},"employee_ssn":"456 78 9123","employee_address":{"street":"111 BCD Cir","city":"Some City","state":"Some State","postal_code":500091}}',
 '{"employee_id":3,"employee_first_name":"Nick","employee_last_name":"Junior","employee_salary":750.0,"employee_nationality":"United Kingdom","employee_email_ids":["nick@junior.com","njunior@companyx.com"],"employee_phone_numbers":{"Office":"+44 222 222 2222","Home":"+44 111 111 1111"},"employee_ssn":"222 33 4444","employee_address":{"street":"222 Giant Cly","city":"UK City","state":"UK Province"}}',
 '{"employee_id":4,"employee_first_name":"Bill","employee_last_name":"Gomes","employee_salary":1500.0,"employee_nationality":"Australia","employee_email_ids":["bill@gomes.com","bgomes@

In [45]:
!ls -ltr ../data

total 4
drwxr-xr-x 3 itv011204 students 4096 Feb 20 16:48 employees


In [49]:
!hdfs dfs -ls /user/itversity/employees

Found 2 items
-rw-r--r--   3 itversity itversity          0 2022-07-16 02:24 /user/itversity/employees/_SUCCESS
-rw-r--r--   3 itversity itversity       1258 2022-07-16 02:24 /user/itversity/employees/part-00000-49b27c92-f756-4ae8-9791-21181c4f881a-c000.json


In [64]:
!hdfs dfs -put -f /home/itv011204/data/employees /user/`whoami`/

In [65]:
!hdfs dfs -ls /user/itv011204

Found 10 items
drwx------   - itv011204 supergroup          0 2024-01-29 18:47 /user/itv011204/.Trash
drwxr-xr-x   - itv011204 supergroup          0 2024-02-20 17:30 /user/itv011204/.sparkStaging
drwxr-xr-x   - itv011204 supergroup          0 2024-02-14 17:26 /user/itv011204/airtraffic_all
drwxr-xr-x   - itv011204 supergroup          0 2024-01-29 19:33 /user/itv011204/core
drwxr-xr-x   - itv011204 supergroup          0 2024-01-29 08:16 /user/itv011204/df
drwxr-xr-x   - itv011204 supergroup          0 2024-02-20 17:35 /user/itv011204/employees
drwxr-xr-x   - itv011204 supergroup          0 2024-01-31 05:58 /user/itv011204/external
drwxr-xr-x   - itv011204 supergroup          0 2024-01-31 07:26 /user/itv011204/nyse
drwxr-xr-x   - itv011204 supergroup          0 2024-02-18 14:12 /user/itv011204/retail_db
drwxr-xr-x   - itv011204 supergroup          0 2024-02-20 16:49 /user/itv011204/warehouse


In [66]:
!hdfs dfs -ls /user/itv011204/employees

Found 3 items
drwxr-xr-x   - itv011204 supergroup          0 2024-02-20 17:35 /user/itv011204/employees/.ipynb_checkpoints
-rw-r--r--   3 itv011204 supergroup          0 2024-02-20 17:35 /user/itv011204/employees/_SUCCESS
-rw-r--r--   3 itv011204 supergroup       1258 2024-02-20 17:35 /user/itv011204/employees/part-00000-49b27c92-f756-4ae8-9791-21181c4f881a-c000.json


In [68]:
!hdfs dfs -rm -R -skipTrash /user/itv011204/employees/.ipynb_checkpoints

Deleted /user/itv011204/employees/.ipynb_checkpoints


In [69]:
!hdfs dfs -ls /user/itv011204/employees

Found 2 items
-rw-r--r--   3 itv011204 supergroup          0 2024-02-20 17:35 /user/itv011204/employees/_SUCCESS
-rw-r--r--   3 itv011204 supergroup       1258 2024-02-20 17:35 /user/itv011204/employees/part-00000-49b27c92-f756-4ae8-9791-21181c4f881a-c000.json


In [2]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

In [3]:
spark = SparkSession. \
    builder. \
    config('spark.ui.port','0'). \
    config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
    enableHiveSupport(). \
    appName(f'{username} | Section 19 Semi Structured Data'). \
    master('yarn'). \
    getOrCreate()
    

In [4]:
spark.conf.set('spark.sql.shuffle.partitions','2')

In [5]:
employees_df = spark.read.json(f'/user/{username}/employees')

In [6]:
employees_df.printSchema()

root
 |-- employee_address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- postal_code: long (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- street: string (nullable = true)
 |-- employee_email_ids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- employee_first_name: string (nullable = true)
 |-- employee_id: long (nullable = true)
 |-- employee_last_name: string (nullable = true)
 |-- employee_nationality: string (nullable = true)
 |-- employee_phone_numbers: struct (nullable = true)
 |    |-- Home: string (nullable = true)
 |    |-- Office: string (nullable = true)
 |-- employee_salary: double (nullable = true)
 |-- employee_ssn: string (nullable = true)



In [8]:
employees_df.show(truncate=False)

+--------------------------------------------+---------------------------------------+-------------------+-----------+------------------+--------------------+------------------------------------+---------------+------------+
|employee_address                            |employee_email_ids                     |employee_first_name|employee_id|employee_last_name|employee_nationality|employee_phone_numbers              |employee_salary|employee_ssn|
+--------------------------------------------+---------------------------------------+-------------------+-----------+------------------+--------------------+------------------------------------+---------------+------------+
|{Some City, 500091, Some State, 111 BCD Cir}|[henry@ford.com, hford@companyx.com]   |Henry              |2          |Ford              |India               |{+91 234 567 8901, +91 345 678 9012}|1250.0         |456 78 9123 |
|{UK City, null, UK Province, 222 Giant Cly} |[nick@junior.com, njunior@companyx.com]|Nick          

In [9]:
employees_df.select('employee_email_ids').show(truncate=False)

+---------------------------------------+
|employee_email_ids                     |
+---------------------------------------+
|[henry@ford.com, hford@companyx.com]   |
|[nick@junior.com, njunior@companyx.com]|
|[bill@gomes.com, bgomes@companyx.com]  |
|null                                   |
+---------------------------------------+



In [10]:
employees_df.count()

4

In [11]:
from pyspark.sql.functions import explode

In [12]:
employees_df.select('employee_id','employee_first_name','employee_email_ids').\
    show(truncate=False)

+-----------+-------------------+---------------------------------------+
|employee_id|employee_first_name|employee_email_ids                     |
+-----------+-------------------+---------------------------------------+
|2          |Henry              |[henry@ford.com, hford@companyx.com]   |
|3          |Nick               |[nick@junior.com, njunior@companyx.com]|
|4          |Bill               |[bill@gomes.com, bgomes@companyx.com]  |
|5          |Harry              |null                                   |
+-----------+-------------------+---------------------------------------+



In [13]:
employees_df. \
    select(
        'employee_id',
        'employee_first_name',
        explode('employee_email_ids').alias('employee_email_id')
    ). \
    show(truncate=False)

+-----------+-------------------+--------------------+
|employee_id|employee_first_name|employee_email_id   |
+-----------+-------------------+--------------------+
|2          |Henry              |henry@ford.com      |
|2          |Henry              |hford@companyx.com  |
|3          |Nick               |nick@junior.com     |
|3          |Nick               |njunior@companyx.com|
|4          |Bill               |bill@gomes.com      |
|4          |Bill               |bgomes@companyx.com |
+-----------+-------------------+--------------------+



In [14]:
employees_df.select('employee_id',explode('employee_email_ids')).count()

6

In [15]:
from pyspark.sql.functions import explode_outer

In [16]:
employees_df. \
    select(
        'employee_id',
        'employee_first_name',
        explode_outer('employee_email_ids').alias('employee_email_id')
    ). \
    show(truncate=False)

+-----------+-------------------+--------------------+
|employee_id|employee_first_name|employee_email_id   |
+-----------+-------------------+--------------------+
|2          |Henry              |henry@ford.com      |
|2          |Henry              |hford@companyx.com  |
|3          |Nick               |nick@junior.com     |
|3          |Nick               |njunior@companyx.com|
|4          |Bill               |bill@gomes.com      |
|4          |Bill               |bgomes@companyx.com |
|5          |Harry              |null                |
+-----------+-------------------+--------------------+



In [17]:
employees_df.select('employee_id',explode_outer('employee_email_ids')).count()

7

In [18]:
from pyspark.sql.functions import size

In [19]:
size?

[0;31mSignature:[0m [0msize[0m[0;34m([0m[0mcol[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Collection function: returns the length of the array or map stored in the column.

.. versionadded:: 1.5.0

Parameters
----------
col : :class:`~pyspark.sql.Column` or str
    name of column or expression

Examples
--------
>>> df = spark.createDataFrame([([1, 2, 3],),([1],),([],)], ['data'])
>>> df.select(size(df.data)).collect()
[Row(size(data)=3), Row(size(data)=1), Row(size(data)=0)]
[0;31mFile:[0m      /opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/sql/functions.py
[0;31mType:[0m      function


In [20]:
employees_df. \
    select(
        'employee_id',
        size('employee_email_ids').alias('employee_email_count')
    ). \
    show()

+-----------+--------------------+
|employee_id|employee_email_count|
+-----------+--------------------+
|          2|                   2|
|          3|                   2|
|          4|                   2|
|          5|                  -1|
+-----------+--------------------+



In [23]:
from pyspark.sql import functions as f

In [24]:
f.coalesce?

[0;31mSignature:[0m [0mf[0m[0;34m.[0m[0mcoalesce[0m[0;34m([0m[0;34m*[0m[0mcols[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Returns the first column that is not null.

.. versionadded:: 1.4.0

Examples
--------
>>> cDf = spark.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))
>>> cDf.show()
+----+----+
|   a|   b|
+----+----+
|null|null|
|   1|null|
|null|   2|
+----+----+

>>> cDf.select(coalesce(cDf["a"], cDf["b"])).show()
+--------------+
|coalesce(a, b)|
+--------------+
|          null|
|             1|
|             2|
+--------------+

>>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show()
+----+----+----------------+
|   a|   b|coalesce(a, 0.0)|
+----+----+----------------+
|null|null|             0.0|
|   1|null|             1.0|
|null|   2|             0.0|
+----+----+----------------+
[0;31mFile:[0m      /opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/sql/functions.py
[0;31mType:[0m      function


In [25]:
from pyspark.sql.functions import coalesce

In [28]:
employees_df. \
    select(
        'employee_id',
        coalesce('employee_email_ids').alias('employee_email_ids')
    ). \
    show(truncate=False)

+-----------+---------------------------------------+
|employee_id|employee_email_ids                     |
+-----------+---------------------------------------+
|2          |[henry@ford.com, hford@companyx.com]   |
|3          |[nick@junior.com, njunior@companyx.com]|
|4          |[bill@gomes.com, bgomes@companyx.com]  |
|5          |null                                   |
+-----------+---------------------------------------+



In [29]:
from pyspark.sql.functions import array

In [30]:
employees_df. \
    select(
        'employee_id',
        coalesce('employee_email_ids',array()).alias('employee_email_ds')
    ). \
    show(truncate=False)

+-----------+---------------------------------------+
|employee_id|employee_email_ds                      |
+-----------+---------------------------------------+
|2          |[henry@ford.com, hford@companyx.com]   |
|3          |[nick@junior.com, njunior@companyx.com]|
|4          |[bill@gomes.com, bgomes@companyx.com]  |
|5          |[]                                     |
+-----------+---------------------------------------+



In [31]:
employees_df. \
    select(
        'employee_id',
        size(coalesce('employee_email_ids',array())).alias('employee_email_ds')
    ). \
    show(truncate=False)

+-----------+-----------------+
|employee_id|employee_email_ds|
+-----------+-----------------+
|2          |2                |
|3          |2                |
|4          |2                |
|5          |0                |
+-----------+-----------------+



In [42]:
from pyspark.sql.functions import lit

In [54]:
employees_df. \
    select(
        'employee_id',
        size(coalesce('employee_email_ids',array(lit('')))).alias('employee_email_ds')
    ). \
    show(truncate=False)

+-----------+-----------------+
|employee_id|employee_email_ds|
+-----------+-----------------+
|2          |2                |
|3          |2                |
|4          |2                |
|5          |1                |
+-----------+-----------------+



In [40]:
coalesce?

[0;31mSignature:[0m [0mcoalesce[0m[0;34m([0m[0;34m*[0m[0mcols[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Returns the first column that is not null.

.. versionadded:: 1.4.0

Examples
--------
>>> cDf = spark.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))
>>> cDf.show()
+----+----+
|   a|   b|
+----+----+
|null|null|
|   1|null|
|null|   2|
+----+----+

>>> cDf.select(coalesce(cDf["a"], cDf["b"])).show()
+--------------+
|coalesce(a, b)|
+--------------+
|          null|
|             1|
|             2|
+--------------+

>>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show()
+----+----+----------------+
|   a|   b|coalesce(a, 0.0)|
+----+----+----------------+
|null|null|             0.0|
|   1|null|             1.0|
|null|   2|             0.0|
+----+----+----------------+
[0;31mFile:[0m      /opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/sql/functions.py
[0;31mType:[0m      function


In [7]:
employees_df. \
    select(
        'employee_id',
        'employee_email_ids'
    ). \
    show(truncate=False)

+-----------+---------------------------------------+
|employee_id|employee_email_ids                     |
+-----------+---------------------------------------+
|2          |[henry@ford.com, hford@companyx.com]   |
|3          |[nick@junior.com, njunior@companyx.com]|
|4          |[bill@gomes.com, bgomes@companyx.com]  |
|5          |null                                   |
+-----------+---------------------------------------+



In [8]:
from pyspark.sql.functions import concat_ws

In [9]:
concat_ws?

[0;31mSignature:[0m [0mconcat_ws[0m[0;34m([0m[0msep[0m[0;34m,[0m [0;34m*[0m[0mcols[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Concatenates multiple input string columns together into a single string column,
using the given separator.

.. versionadded:: 1.5.0

Examples
--------
>>> df = spark.createDataFrame([('abcd','123')], ['s', 'd'])
>>> df.select(concat_ws('-', df.s, df.d).alias('s')).collect()
[Row(s='abcd-123')]
[0;31mFile:[0m      /opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/sql/functions.py
[0;31mType:[0m      function


In [11]:
employees_df. \
    select(
        'employee_id',
        concat_ws(', ','employee_email_ids')
    ). \
    show(truncate=False)

+-----------+-------------------------------------+
|employee_id|concat_ws(, , employee_email_ids)    |
+-----------+-------------------------------------+
|2          |henry@ford.com, hford@companyx.com   |
|3          |nick@junior.com, njunior@companyx.com|
|4          |bill@gomes.com, bgomes@companyx.com  |
|5          |                                     |
+-----------+-------------------------------------+



In [13]:
employees_df.toJSON().collect()

['{"employee_address":{"city":"Some City","postal_code":500091,"state":"Some State","street":"111 BCD Cir"},"employee_email_ids":["henry@ford.com","hford@companyx.com"],"employee_first_name":"Henry","employee_id":2,"employee_last_name":"Ford","employee_nationality":"India","employee_phone_numbers":{"Home":"+91 234 567 8901","Office":"+91 345 678 9012"},"employee_salary":1250.0,"employee_ssn":"456 78 9123"}',
 '{"employee_address":{"city":"UK City","state":"UK Province","street":"222 Giant Cly"},"employee_email_ids":["nick@junior.com","njunior@companyx.com"],"employee_first_name":"Nick","employee_id":3,"employee_last_name":"Junior","employee_nationality":"United Kingdom","employee_phone_numbers":{"Home":"+44 111 111 1111","Office":"+44 222 222 2222"},"employee_salary":750.0,"employee_ssn":"222 33 4444"}',
 '{"employee_email_ids":["bill@gomes.com","bgomes@companyx.com"],"employee_first_name":"Bill","employee_id":4,"employee_last_name":"Gomes","employee_nationality":"Australia","employee_

In [17]:
employees=[
(2,"Henry","Ford",1250.0,"India","henry@ford.com, hford@companyx.com",{"Home":"+91 234 567 8901","Office":"+91 345 678 9012"},"456 78 9123",("111 BCD Cir","Some City","Some State",500091)),
(3,"Nick","Junior",750.0,"United Kingdom","nick@junior.com, njunior@companyx.com",{"Home":"+44 111 111 1111", "Office":"+44 222 222 2222"},"222 33 4444",("222 Giant Cly","UK City","UK Province",None)),
(4,"Bill","Gomes",1500.0,"Australia","bill@gomes.com, bgomes@companyx.com",{"Home":"+61 987 654 3210","Office":"+61 876 543 2109"},"789 12 6118",None),
(5,"Harry","Potter",1800.0,"United States",None,None,None,None)
]

In [18]:
employees_df = spark.createDataFrame(
    employees,
    schema = """
        employee_id INT, employee_first_name STRING, employee_last_name STRING, 
        employee_salary FLOAT, employee_nationality STRING,
        employee_email_ids STRING,
        employee_phone_numbers MAP<STRING, STRING>,
        employee_ssn STRING,
        employee_address STRUCT<street: STRING, city: STRING, state: STRING, postal_code: INT>
    """
)

In [20]:
employees_df.select('employee_id','employee_email_ids').show(truncate=False)

+-----------+-------------------------------------+
|employee_id|employee_email_ids                   |
+-----------+-------------------------------------+
|2          |henry@ford.com, hford@companyx.com   |
|3          |nick@junior.com, njunior@companyx.com|
|4          |bill@gomes.com, bgomes@companyx.com  |
|5          |null                                 |
+-----------+-------------------------------------+



In [21]:
from pyspark.sql.functions import split

In [30]:
employees_df. \
    select(
        'employee_id',
        split('employee_email_ids',', ').alias('employee_email_ids')
    ). \
    show(truncate=False)

+-----------+---------------------------------------+
|employee_id|employee_email_ids                     |
+-----------+---------------------------------------+
|2          |[henry@ford.com, hford@companyx.com]   |
|3          |[nick@junior.com, njunior@companyx.com]|
|4          |[bill@gomes.com, bgomes@companyx.com]  |
|5          |null                                   |
+-----------+---------------------------------------+



In [31]:
employees_df. \
    select(
        'employee_id',
        split('employee_email_ids',', ').alias('employee_email_ids')
    ). \
    printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- employee_email_ids: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [32]:
from pyspark.sql.functions import explode_outer

In [33]:
employees_df. \
    select(
        'employee_id',
        explode_outer(split('employee_email_ids',', ')).alias('employee_email_ids')
    ). \
    printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- employee_email_ids: string (nullable = true)



In [34]:
employees_df. \
    select(
        'employee_id',
        explode_outer(split('employee_email_ids',', ')).alias('employee_email_ids')
    ). \
    show(truncate=False)

+-----------+--------------------+
|employee_id|employee_email_ids  |
+-----------+--------------------+
|2          |henry@ford.com      |
|2          |hford@companyx.com  |
|3          |nick@junior.com     |
|3          |njunior@companyx.com|
|4          |bill@gomes.com      |
|4          |bgomes@companyx.com |
|5          |null                |
+-----------+--------------------+



In [35]:
explode_outer?

[0;31mSignature:[0m [0mexplode_outer[0m[0;34m([0m[0mcol[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Returns a new row for each element in the given array or map.
Unlike explode, if the array/map is null or empty then null is produced.
Uses the default column name `col` for elements in the array and
`key` and `value` for elements in the map unless specified otherwise.

.. versionadded:: 2.3.0

Examples
--------
>>> df = spark.createDataFrame(
...     [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)],
...     ("id", "an_array", "a_map")
... )
>>> df.select("id", "an_array", explode_outer("a_map")).show()
+---+----------+----+-----+
| id|  an_array| key|value|
+---+----------+----+-----+
|  1|[foo, bar]|   x|  1.0|
|  2|        []|null| null|
|  3|      null|null| null|
+---+----------+----+-----+

>>> df.select("id", "a_map", explode_outer("an_array")).show()
+---+----------+----+
| id|     a_map| col|
+---+----------+----+
|  1|{x -> 1.0}| foo|
|

In [1]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

In [2]:
spark = SparkSession. \
    builder. \
    config('spark.ui.port','0'). \
    config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
    enableHiveSupport(). \
    appName(f'{username} | Section 19 Semi Structured Data'). \
    master('yarn'). \
    getOrCreate()
    

In [3]:
spark.conf.set('spark.sql.shuffle.partitions','2')

In [5]:
!ls -ltr ../data/employee_emails

total 4
-rw-r--r-- 1 itv011204 students   0 Feb 21 04:59 _SUCCESS
-rw-r--r-- 1 itv011204 students 774 Feb 21 04:59 part-00000-dfce426e-b1c1-4059-8979-e6e224f2fc19-c000.json


In [7]:
!cat ../data/employee_emails/part-00000-dfce426e-b1c1-4059-8979-e6e224f2fc19-c000.json

{"employee_id":2,"employee_first_name":"Henry","employee_last_name":"Ford","employee_email_id":"henry@ford.com"}
{"employee_id":2,"employee_first_name":"Henry","employee_last_name":"Ford","employee_email_id":"hford@companyx.com"}
{"employee_id":3,"employee_first_name":"Nick","employee_last_name":"Junior","employee_email_id":"nick@junior.com"}
{"employee_id":3,"employee_first_name":"Nick","employee_last_name":"Junior","employee_email_id":"njunior@companyx.com"}
{"employee_id":4,"employee_first_name":"Bill","employee_last_name":"Gomes","employee_email_id":"bill@gomes.com"}
{"employee_id":4,"employee_first_name":"Bill","employee_last_name":"Gomes","employee_email_id":"bgomes@companyx.com"}
{"employee_id":5,"employee_first_name":"Harry","employee_last_name":"Potter"}


In [8]:
!hdfs dfs -put -f ../data/employee_emails /user/`whoami`/

In [9]:
!hdfs dfs -ls /user/`whoami`/employee_emails

Found 2 items
-rw-r--r--   3 itv011204 supergroup          0 2024-02-21 05:04 /user/itv011204/employee_emails/_SUCCESS
-rw-r--r--   3 itv011204 supergroup        774 2024-02-21 05:04 /user/itv011204/employee_emails/part-00000-dfce426e-b1c1-4059-8979-e6e224f2fc19-c000.json


In [10]:
!hdfs dfs -rm -R -skipTrash /user/`whoami`/employee_emails/.ipynb_checkpoints

rm: `/user/itv011204/employee_emails/.ipynb_checkpoints': No such file or directory


In [3]:
employee_emails_df = spark. \
                        read. \
                        json(f'/user/{username}/employee_emails')

In [5]:
employee_emails_df.printSchema()

root
 |-- employee_email_id: string (nullable = true)
 |-- employee_first_name: string (nullable = true)
 |-- employee_id: long (nullable = true)
 |-- employee_last_name: string (nullable = true)



In [6]:
employee_emails_df.show(truncate=False)

+--------------------+-------------------+-----------+------------------+
|employee_email_id   |employee_first_name|employee_id|employee_last_name|
+--------------------+-------------------+-----------+------------------+
|henry@ford.com      |Henry              |2          |Ford              |
|hford@companyx.com  |Henry              |2          |Ford              |
|nick@junior.com     |Nick               |3          |Junior            |
|njunior@companyx.com|Nick               |3          |Junior            |
|bill@gomes.com      |Bill               |4          |Gomes             |
|bgomes@companyx.com |Bill               |4          |Gomes             |
|null                |Harry              |5          |Potter            |
+--------------------+-------------------+-----------+------------------+



In [7]:
from pyspark.sql.functions import count

In [10]:
employee_emails_df. \
    groupBy('employee_id','employee_first_name','employee_last_name'). \
    agg(count('*').alias('employee_count')). \
    show()

+-----------+-------------------+------------------+--------------+
|employee_id|employee_first_name|employee_last_name|employee_count|
+-----------+-------------------+------------------+--------------+
|          3|               Nick|            Junior|             2|
|          4|               Bill|             Gomes|             2|
|          5|              Harry|            Potter|             1|
|          2|              Henry|              Ford|             2|
+-----------+-------------------+------------------+--------------+



In [14]:
from pyspark.sql.functions import lit

In [15]:
employee_emails_df. \
    groupBy('employee_id','employee_first_name','employee_last_name'). \
    agg(count(lit('1')).alias('employee_count')). \
    show()

+-----------+-------------------+------------------+--------------+
|employee_id|employee_first_name|employee_last_name|employee_count|
+-----------+-------------------+------------------+--------------+
|          3|               Nick|            Junior|             2|
|          4|               Bill|             Gomes|             2|
|          5|              Harry|            Potter|             1|
|          2|              Henry|              Ford|             2|
+-----------+-------------------+------------------+--------------+



In [16]:
employee_emails_df. \
    groupBy('employee_id','employee_first_name','employee_last_name'). \
    agg(count('employee_email_id').alias('employee_count')). \
    show()

+-----------+-------------------+------------------+--------------+
|employee_id|employee_first_name|employee_last_name|employee_count|
+-----------+-------------------+------------------+--------------+
|          3|               Nick|            Junior|             2|
|          4|               Bill|             Gomes|             2|
|          5|              Harry|            Potter|             0|
|          2|              Henry|              Ford|             2|
+-----------+-------------------+------------------+--------------+



In [18]:
from pyspark.sql.functions import collect_set, count, lit

In [21]:
employee_emails_df. \
    groupBy('employee_id','employee_first_name','employee_last_name'). \
    agg(
        count('employee_email_id').alias('employee_count'), 
        collect_set('employee_email_id').alias('employee_email_ids')
    ). \
    show(truncate=False)

+-----------+-------------------+------------------+--------------+---------------------------------------+
|employee_id|employee_first_name|employee_last_name|employee_count|employee_email_ids                     |
+-----------+-------------------+------------------+--------------+---------------------------------------+
|3          |Nick               |Junior            |2             |[njunior@companyx.com, nick@junior.com]|
|4          |Bill               |Gomes             |2             |[bill@gomes.com, bgomes@companyx.com]  |
|5          |Harry              |Potter            |0             |[]                                     |
|2          |Henry              |Ford              |2             |[henry@ford.com, hford@companyx.com]   |
+-----------+-------------------+------------------+--------------+---------------------------------------+



In [1]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

In [2]:
spark = SparkSession. \
    builder. \
    config('spark.ui.port','0'). \
    config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
    enableHiveSupport(). \
    appName(f'{username} | Section 19 Semi Structured Data'). \
    master('yarn'). \
    getOrCreate()
    

In [3]:
spark.conf.set('spark.sql.shuffle.partitions','2')

In [4]:
!cat ../data/employees/part-00000-49b27c92-f756-4ae8-9791-21181c4f881a-c000.json

{"employee_id":2,"employee_first_name":"Henry","employee_last_name":"Ford","employee_salary":1250.0,"employee_nationality":"India","employee_email_ids":["henry@ford.com","hford@companyx.com"],"employee_phone_numbers":{"Office":"+91 345 678 9012","Home":"+91 234 567 8901"},"employee_ssn":"456 78 9123","employee_address":{"street":"111 BCD Cir","city":"Some City","state":"Some State","postal_code":500091}}
{"employee_id":3,"employee_first_name":"Nick","employee_last_name":"Junior","employee_salary":750.0,"employee_nationality":"United Kingdom","employee_email_ids":["nick@junior.com","njunior@companyx.com"],"employee_phone_numbers":{"Office":"+44 222 222 2222","Home":"+44 111 111 1111"},"employee_ssn":"222 33 4444","employee_address":{"street":"222 Giant Cly","city":"UK City","state":"UK Province"}}
{"employee_id":4,"employee_first_name":"Bill","employee_last_name":"Gomes","employee_salary":1500.0,"employee_nationality":"Australia","employee_email_ids":["bill@gomes.com","bgomes@companyx.c

In [5]:
employees_df = spark.read.json(f'/user/{username}/employees')

In [6]:
employees_df.printSchema()

root
 |-- employee_address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- postal_code: long (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- street: string (nullable = true)
 |-- employee_email_ids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- employee_first_name: string (nullable = true)
 |-- employee_id: long (nullable = true)
 |-- employee_last_name: string (nullable = true)
 |-- employee_nationality: string (nullable = true)
 |-- employee_phone_numbers: struct (nullable = true)
 |    |-- Home: string (nullable = true)
 |    |-- Office: string (nullable = true)
 |-- employee_salary: double (nullable = true)
 |-- employee_ssn: string (nullable = true)



In [7]:
employees_df. \
    select('employee_id','employee_address'). \
    show(truncate=False)

+-----------+--------------------------------------------+
|employee_id|employee_address                            |
+-----------+--------------------------------------------+
|2          |{Some City, 500091, Some State, 111 BCD Cir}|
|3          |{UK City, null, UK Province, 222 Giant Cly} |
|4          |null                                        |
|5          |null                                        |
+-----------+--------------------------------------------+



In [8]:
employees_df. \
    select('employee_id','employee_address'). \
    printSchema()

root
 |-- employee_id: long (nullable = true)
 |-- employee_address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- postal_code: long (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- street: string (nullable = true)



In [9]:
employees_df. \
    select('employee_id','employee_address.*'). \
    printSchema()

root
 |-- employee_id: long (nullable = true)
 |-- city: string (nullable = true)
 |-- postal_code: long (nullable = true)
 |-- state: string (nullable = true)
 |-- street: string (nullable = true)



In [10]:
employees_df. \
    select('employee_id','employee_address.*'). \
    show(truncate=False)

+-----------+---------+-----------+-----------+-------------+
|employee_id|city     |postal_code|state      |street       |
+-----------+---------+-----------+-----------+-------------+
|2          |Some City|500091     |Some State |111 BCD Cir  |
|3          |UK City  |null       |UK Province|222 Giant Cly|
|4          |null     |null       |null       |null         |
|5          |null     |null       |null       |null         |
+-----------+---------+-----------+-----------+-------------+



In [11]:
employees_df. \
    select('employee_id','employee_address.city'). \
    show(truncate=False)

+-----------+---------+
|employee_id|city     |
+-----------+---------+
|2          |Some City|
|3          |UK City  |
|4          |null     |
|5          |null     |
+-----------+---------+



In [12]:
employees_df. \
    select('employee_id','employee_address.city','employee_address.postal_code'). \
    show(truncate=False)

+-----------+---------+-----------+
|employee_id|city     |postal_code|
+-----------+---------+-----------+
|2          |Some City|500091     |
|3          |UK City  |null       |
|4          |null     |null       |
|5          |null     |null       |
+-----------+---------+-----------+



In [13]:
from pyspark.sql.functions import concat_ws

In [14]:
concat_ws?

[0;31mSignature:[0m [0mconcat_ws[0m[0;34m([0m[0msep[0m[0;34m,[0m [0;34m*[0m[0mcols[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Concatenates multiple input string columns together into a single string column,
using the given separator.

.. versionadded:: 1.5.0

Examples
--------
>>> df = spark.createDataFrame([('abcd','123')], ['s', 'd'])
>>> df.select(concat_ws('-', df.s, df.d).alias('s')).collect()
[Row(s='abcd-123')]
[0;31mFile:[0m      /opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/sql/functions.py
[0;31mType:[0m      function


In [15]:
employees_df. \
    select('employee_id',
        concat_ws(', ','employee_address.street','employee_address.city').alias('employee_address')
    ). \
    show(truncate=False)

+-----------+----------------------+
|employee_id|employee_address      |
+-----------+----------------------+
|2          |111 BCD Cir, Some City|
|3          |222 Giant Cly, UK City|
|4          |                      |
|5          |                      |
+-----------+----------------------+



In [16]:
employees_df. \
    select('employee_id',
        concat_ws(', ','employee_address.street','employee_address.city','employee_address.postal_code').alias('employee_address')
    ). \
    show(truncate=False)

+-----------+------------------------------+
|employee_id|employee_address              |
+-----------+------------------------------+
|2          |111 BCD Cir, Some City, 500091|
|3          |222 Giant Cly, UK City        |
|4          |                              |
|5          |                              |
+-----------+------------------------------+



In [17]:
employees_df. \
    select('employee_id','employee_address'). \
    show(truncate=False)

+-----------+--------------------------------------------+
|employee_id|employee_address                            |
+-----------+--------------------------------------------+
|2          |{Some City, 500091, Some State, 111 BCD Cir}|
|3          |{UK City, null, UK Province, 222 Giant Cly} |
|4          |null                                        |
|5          |null                                        |
+-----------+--------------------------------------------+



In [18]:
employees_df. \
    filter('employee_address.postal_code IS NOT NULL'). \
    show(truncate=False)

+--------------------------------------------+------------------------------------+-------------------+-----------+------------------+--------------------+------------------------------------+---------------+------------+
|employee_address                            |employee_email_ids                  |employee_first_name|employee_id|employee_last_name|employee_nationality|employee_phone_numbers              |employee_salary|employee_ssn|
+--------------------------------------------+------------------------------------+-------------------+-----------+------------------+--------------------+------------------------------------+---------------+------------+
|{Some City, 500091, Some State, 111 BCD Cir}|[henry@ford.com, hford@companyx.com]|Henry              |2          |Ford              |India               |{+91 234 567 8901, +91 345 678 9012}|1250.0         |456 78 9123 |
+--------------------------------------------+------------------------------------+-------------------+---------

In [42]:
from pyspark.sql.functions import lit,col

In [43]:
employees_df. \
    select('employee_id','employee_first_name','employee_address.*'). \
    show(truncate=False)

+-----------+-------------------+---------+-----------+-----------+-------------+
|employee_id|employee_first_name|city     |postal_code|state      |street       |
+-----------+-------------------+---------+-----------+-----------+-------------+
|2          |Henry              |Some City|500091     |Some State |111 BCD Cir  |
|3          |Nick               |UK City  |null       |UK Province|222 Giant Cly|
|4          |Bill               |null     |null       |null       |null         |
|5          |Harry              |null     |null       |null       |null         |
+-----------+-------------------+---------+-----------+-----------+-------------+



In [45]:
employees_df. \
    filter(col('employee_id')==lit(2)). \
    select('employee_id','employee_first_name','employee_address.*'). \
    show(truncate=False)

+-----------+-------------------+---------+-----------+----------+-----------+
|employee_id|employee_first_name|city     |postal_code|state     |street     |
+-----------+-------------------+---------+-----------+----------+-----------+
|2          |Henry              |Some City|500091     |Some State|111 BCD Cir|
+-----------+-------------------+---------+-----------+----------+-----------+



In [50]:
employees_df. \
    filter('employee_address.postal_code IS NOT NULL'). \
    select('employee_id','employee_first_name','employee_address.*'). \
    show(truncate=False)

+-----------+-------------------+---------+-----------+----------+-----------+
|employee_id|employee_first_name|city     |postal_code|state     |street     |
+-----------+-------------------+---------+-----------+----------+-----------+
|2          |Henry              |Some City|500091     |Some State|111 BCD Cir|
+-----------+-------------------+---------+-----------+----------+-----------+



In [52]:
employees_df. \
    filter('employee_address.postal_code IS NOT NULL AND employee_first_name="Henry"'). \
    select('employee_id','employee_first_name','employee_address.*'). \
    show(truncate=False)

+-----------+-------------------+---------+-----------+----------+-----------+
|employee_id|employee_first_name|city     |postal_code|state     |street     |
+-----------+-------------------+---------+-----------+----------+-----------+
|2          |Henry              |Some City|500091     |Some State|111 BCD Cir|
+-----------+-------------------+---------+-----------+----------+-----------+



In [56]:
employees_df. \
    filter((col('employee_address.postal_code').isNull()) & (col('employee_first_name')=="Henry")). \
    select('employee_id','employee_first_name','employee_address.*'). \
    show(truncate=False)

+-----------+-------------------+----+-----------+-----+------+
|employee_id|employee_first_name|city|postal_code|state|street|
+-----------+-------------------+----+-----------+-----+------+
+-----------+-------------------+----+-----------+-----+------+



In [57]:
employees_df. \
    filter((~(col('employee_address.postal_code').isNull())) & (col('employee_first_name')=="Henry")). \
    select('employee_id','employee_first_name','employee_address.*'). \
    show(truncate=False)

+-----------+-------------------+---------+-----------+----------+-----------+
|employee_id|employee_first_name|city     |postal_code|state     |street     |
+-----------+-------------------+---------+-----------+----------+-----------+
|2          |Henry              |Some City|500091     |Some State|111 BCD Cir|
+-----------+-------------------+---------+-----------+----------+-----------+



In [59]:
employees_df. \
    filter(~(employees_df.employee_address.postal_code.isNull()) & (employees_df.employee_first_name=="Henry")). \
    select('employee_id','employee_first_name','employee_address.*'). \
    show(truncate=False)

+-----------+-------------------+---------+-----------+----------+-----------+
|employee_id|employee_first_name|city     |postal_code|state     |street     |
+-----------+-------------------+---------+-----------+----------+-----------+
|2          |Henry              |Some City|500091     |Some State|111 BCD Cir|
+-----------+-------------------+---------+-----------+----------+-----------+



In [1]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

In [2]:
spark = SparkSession. \
    builder. \
    config('spark.ui.port','0'). \
    config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
    enableHiveSupport(). \
    appName(f'{username} | Section 19 Semi Structured Data'). \
    master('yarn'). \
    getOrCreate()
    

In [3]:
spark.conf.set('spark.sql.shuffle.partitions','2')

In [4]:
!cat ../data/employees/part-00000-49b27c92-f756-4ae8-9791-21181c4f881a-c000.json

{"employee_id":2,"employee_first_name":"Henry","employee_last_name":"Ford","employee_salary":1250.0,"employee_nationality":"India","employee_email_ids":["henry@ford.com","hford@companyx.com"],"employee_phone_numbers":{"Office":"+91 345 678 9012","Home":"+91 234 567 8901"},"employee_ssn":"456 78 9123","employee_address":{"street":"111 BCD Cir","city":"Some City","state":"Some State","postal_code":500091}}
{"employee_id":3,"employee_first_name":"Nick","employee_last_name":"Junior","employee_salary":750.0,"employee_nationality":"United Kingdom","employee_email_ids":["nick@junior.com","njunior@companyx.com"],"employee_phone_numbers":{"Office":"+44 222 222 2222","Home":"+44 111 111 1111"},"employee_ssn":"222 33 4444","employee_address":{"street":"222 Giant Cly","city":"UK City","state":"UK Province"}}
{"employee_id":4,"employee_first_name":"Bill","employee_last_name":"Gomes","employee_salary":1500.0,"employee_nationality":"Australia","employee_email_ids":["bill@gomes.com","bgomes@companyx.c

In [3]:
employees_df = spark.read.json(f'/user/{username}/employees')

In [5]:
employees_df.printSchema()

root
 |-- employee_address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- postal_code: long (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- street: string (nullable = true)
 |-- employee_email_ids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- employee_first_name: string (nullable = true)
 |-- employee_id: long (nullable = true)
 |-- employee_last_name: string (nullable = true)
 |-- employee_nationality: string (nullable = true)
 |-- employee_phone_numbers: struct (nullable = true)
 |    |-- Home: string (nullable = true)
 |    |-- Office: string (nullable = true)
 |-- employee_salary: double (nullable = true)
 |-- employee_ssn: string (nullable = true)



In [6]:
from pyspark.sql.functions import map_keys, map_values

In [7]:
employees_df. \
    select(
        'employee_id','employee_phone_numbers'
    ). \
    show(truncate=False)

+-----------+------------------------------------+
|employee_id|employee_phone_numbers              |
+-----------+------------------------------------+
|2          |{+91 234 567 8901, +91 345 678 9012}|
|3          |{+44 111 111 1111, +44 222 222 2222}|
|4          |{+61 987 654 3210, +61 876 543 2109}|
|5          |null                                |
+-----------+------------------------------------+



In [8]:
employees_df. \
    select(
        'employee_id',map_keys('employee_phone_numbers').alias('employee_phone_numbers_keys')
    ). \
    show(truncate=False)

AnalysisException: cannot resolve 'map_keys(`employee_phone_numbers`)' due to data type mismatch: argument 1 requires map type, however, '`employee_phone_numbers`' is of struct<Home:string,Office:string> type.;
'Project [employee_id#10L, map_keys(employee_phone_numbers#13) AS employee_phone_numbers_keys#38]
+- Relation[employee_address#7,employee_email_ids#8,employee_first_name#9,employee_id#10L,employee_last_name#11,employee_nationality#12,employee_phone_numbers#13,employee_salary#14,employee_ssn#15] json


In [9]:
employees_df = spark. \
                read. \
                json(
                    f'/user/{username}/employees',
                    schema = """
                        employee_id INT, employee_first_name STRING, employee_last_name STRING, 
                        employee_salary FLOAT, employee_nationality STRING,
                        employee_email_ids ARRAY<STRING>,
                        employee_phone_numbers MAP<STRING, STRING>,
                        employee_ssn STRING,
                        employee_address STRUCT<street: STRING, city: STRING, state: STRING, postal_code: INT>
                    """
                )

In [10]:
employees_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- employee_first_name: string (nullable = true)
 |-- employee_last_name: string (nullable = true)
 |-- employee_salary: float (nullable = true)
 |-- employee_nationality: string (nullable = true)
 |-- employee_email_ids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- employee_phone_numbers: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- employee_ssn: string (nullable = true)
 |-- employee_address: struct (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- postal_code: integer (nullable = true)



In [11]:
employees_df. \
    select(
        'employee_id','employee_phone_numbers'
    ). \
    show(truncate=False)

+-----------+------------------------------------------------------+
|employee_id|employee_phone_numbers                                |
+-----------+------------------------------------------------------+
|2          |{Office -> +91 345 678 9012, Home -> +91 234 567 8901}|
|3          |{Office -> +44 222 222 2222, Home -> +44 111 111 1111}|
|4          |{Office -> +61 876 543 2109, Home -> +61 987 654 3210}|
|5          |null                                                  |
+-----------+------------------------------------------------------+



In [12]:
from pyspark.sql.functions import map_keys, map_values

In [13]:
employees_df. \
    select(
        'employee_id', map_keys('employee_phone_numbers').alias('employee_phone_number_keys')
    ). \
    show()

+-----------+--------------------------+
|employee_id|employee_phone_number_keys|
+-----------+--------------------------+
|          2|            [Office, Home]|
|          3|            [Office, Home]|
|          4|            [Office, Home]|
|          5|                      null|
+-----------+--------------------------+



In [14]:
employees_df. \
    select(
        'employee_id', map_values('employee_phone_numbers').alias('employee_phone_number_values')
    ). \
    show()

+-----------+----------------------------+
|employee_id|employee_phone_number_values|
+-----------+----------------------------+
|          2|        [+91 345 678 9012...|
|          3|        [+44 222 222 2222...|
|          4|        [+61 876 543 2109...|
|          5|                        null|
+-----------+----------------------------+



In [16]:
employees_df. \
    select(
        'employee_id', 'employee_phone_numbers.Home'
    ). \
    show()

+-----------+----------------+
|employee_id|            Home|
+-----------+----------------+
|          2|+91 234 567 8901|
|          3|+44 111 111 1111|
|          4|+61 987 654 3210|
|          5|            null|
+-----------+----------------+



In [17]:
employees_df. \
    select(
        'employee_id', 'employee_phone_numbers.Office'
    ). \
    show()

+-----------+----------------+
|employee_id|          Office|
+-----------+----------------+
|          2|+91 345 678 9012|
|          3|+44 222 222 2222|
|          4|+61 876 543 2109|
|          5|            null|
+-----------+----------------+



In [18]:
employees_df. \
    select(
        'employee_id', 'employee_phone_numbers.*'
    ). \
    show()

AnalysisException: Can only star expand struct data types. Attribute: `ArrayBuffer(employee_phone_numbers)`

In [19]:
from pyspark.sql.functions import col

In [20]:
employees_df. \
    select(
        'employee_id', 
        col('employee_phone_numbers.Home').alias('home_phone_number'), 
        col('employee_phone_numbers.Office').alias('office_phone_number')
    ). \
    show()

+-----------+-----------------+-------------------+
|employee_id|home_phone_number|office_phone_number|
+-----------+-----------------+-------------------+
|          2| +91 234 567 8901|   +91 345 678 9012|
|          3| +44 111 111 1111|   +44 222 222 2222|
|          4| +61 987 654 3210|   +61 876 543 2109|
|          5|             null|               null|
+-----------+-----------------+-------------------+

