## Section 18 Exploring Spark Catalog

In [None]:
from pyspark.sql import SparkSession
import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
        config('spark.shuffle.io.connectionTimeout','6000'). \
        config('spark.driver.memory','6g'). \
        config('spark.executor.memory','6g'). \
        config('spark.dynamicAllocation.minExecutors', '4'). \
        enableHiveSupport(). \
        appName(f'{username} | Section 18 Spark Metastore'). \
        master('yarn'). \
        getOrCreate()

In [2]:
spark.conf.set("spark.sql.shuffle.partitions","2")

In [3]:
import getpass

username = getpass.getuser()

In [4]:
username

'itv011204'

In [9]:
spark.catalog

<pyspark.sql.catalog.Catalog at 0x7f7bacd05320>

In [10]:
spark.catalog?

[0;31mType:[0m        property
[0;31mString form:[0m <property object at 0x7f7bacb1b9a8>
[0;31mDocstring:[0m  
Interface through which the user may create, drop, alter or query underlying
databases, tables, functions, etc.

.. versionadded:: 2.0.0

Returns
-------
:class:`Catalog`


In [11]:
help(spark.catalog)

Help on Catalog in module pyspark.sql.catalog object:

class Catalog(builtins.object)
 |  User-facing catalog API, accessible through `SparkSession.catalog`.
 |  
 |  This is a thin wrapper around its Scala implementation org.apache.spark.sql.catalog.Catalog.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, sparkSession)
 |      Create a new Catalog that wraps the underlying JVM object.
 |  
 |  cacheTable(self, tableName)
 |      Caches the specified table in-memory.
 |      
 |      .. versionadded:: 2.0
 |  
 |  clearCache(self)
 |      Removes all cached tables from the in-memory cache.
 |      
 |      .. versionadded:: 2.0
 |  
 |  createExternalTable(self, tableName, path=None, source=None, schema=None, **options)
 |      Creates a table based on the dataset in a data source.
 |      
 |      It returns the DataFrame associated with the external table.
 |      
 |      The data source is specified by the ``source`` and a set of ``options``.
 |      If ``source`` is not spe

In [12]:
spark.sql(f"DROP DATABASE IF EXISTS {username}_demo_db CASCADE")

In [13]:
spark.sql(f"CREATE DATABASE {username}_demo_db")

In [14]:
spark.catalog.setCurrentDatabase(f"{username}_demo_db")

In [22]:
spark.catalog.currentDatabase()

'itv011204_demo_db'

In [15]:
spark.catalog.listTables()

[]

In [18]:
l = [("X",)]

In [19]:
df = spark.createDataFrame(l,schema="dummy STRING")

In [21]:
df.show()

+-----+
|dummy|
+-----+
|    X|
+-----+



In [23]:
df.write.saveAsTable?

[0;31mSignature:[0m
[0mdf[0m[0;34m.[0m[0mwrite[0m[0;34m.[0m[0msaveAsTable[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mformat[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmode[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpartitionBy[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0moptions[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Saves the content of the :class:`DataFrame` as the specified table.

In the case the table already exists, behavior of this function depends on the
save mode, specified by the `mode` function (default to throwing an exception).
When `mode` is `Overwrite`, the schema of the :class:`DataFrame` does not need to be
the same as that of the existing table.

* `append`: Append contents of this :class:`DataFrame` to existing data.
* `overwrite`: Overwrite exi

In [24]:
df.write.saveAsTable("dual",mode='overwrite')

In [25]:
df.write.saveAsTable("dual",mode='append')

In [26]:
spark.read.table("dual").show()

+-----+
|dummy|
+-----+
|    X|
|    X|
+-----+



In [28]:
spark.sql("select * from dual").show()

+-----+
|dummy|
+-----+
|    X|
|    X|
+-----+



In [29]:
spark.sql("DROP TABLE dual")

In [31]:
schema = df.schema

In [32]:
schema

StructType(List(StructField(dummy,StringType,true)))

In [33]:
spark.catalog.createTable?

[0;31mSignature:[0m
[0mspark[0m[0;34m.[0m[0mcatalog[0m[0;34m.[0m[0mcreateTable[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtableName[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpath[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msource[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mschema[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdescription[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0moptions[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Creates a table based on the dataset in a data source.

It returns the DataFrame associated with the table.

The data source is specified by the ``source`` and a set of ``options``.
If ``source`` is not specified, the default data source configured by
``spark.sql.sources.default`` will be used. When ``path`` is specified, an external table is
created from the dat

In [35]:
spark.catalog.createTable("dual", schema=schema)

dummy


In [36]:
spark.catalog.listTables()

[Table(name='dual', database='itv011204_demo_db', description=None, tableType='MANAGED', isTemporary=False)]

In [37]:
spark.write?

Object `spark.write` not found.


In [39]:
df.write.insertInto?

[0;31mSignature:[0m [0mdf[0m[0;34m.[0m[0mwrite[0m[0;34m.[0m[0minsertInto[0m[0;34m([0m[0mtableName[0m[0;34m,[0m [0moverwrite[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Inserts the content of the :class:`DataFrame` to the specified table.

It requires that the schema of the :class:`DataFrame` is the same as the
schema of the table.

Optionally overwriting any existing data.

.. versionadded:: 1.4
[0;31mFile:[0m      /opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/sql/readwriter.py
[0;31mType:[0m      method


In [40]:
df.write.insertInto("dual")

In [41]:
spark.read.table("dual").show()

+-----+
|dummy|
+-----+
|    X|
+-----+



In [42]:
df.write.insertInto("dual")

In [43]:
df.write.insertInto("dual")

In [44]:
spark.read.table("dual")

dummy
X
X
X


In [45]:
spark.sql("SELECT * FROM dual").show()

+-----+
|dummy|
+-----+
|    X|
|    X|
|    X|
+-----+



In [46]:
df.write.saveAsTable("dual",mode='append')

In [47]:
spark.sql("SELECT * FROM dual").show()

+-----+
|dummy|
+-----+
|    X|
|    X|
|    X|
|    X|
+-----+



In [48]:
spark.read.table("dual")

dummy
X
X
X
X


###  217 Inferring Schema While creating Spark Metastore Tables using Spark Catalog

In [1]:
from pyspark.sql import SparkSession
import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
        config('spark.shuffle.io.connectionTimeout','6000'). \
        config('spark.driver.memory','6g'). \
        config('spark.executor.memory','6g'). \
        config('spark.dynamicAllocation.minExecutors', '4'). \
        enableHiveSupport(). \
        appName(f'{username} | Section 18 Spark Metastore'). \
        master('yarn'). \
        getOrCreate()

In [2]:
spark.conf.set("spark.sql.shuffle.partitions","2")

In [3]:
import getpass

username = getpass.getuser()

In [4]:
username

'itv011204'

In [5]:
spark.catalog.currentDatabase()

'default'

In [6]:
spark.sql(f"DROP DATABASE IF EXISTS {username}_airtraffic CASCADE")

In [7]:
spark.sql(f"CREATE DATABASE IF NOT EXISTS {username}_airtraffic")

In [8]:
spark.catalog.setCurrentDatabase(f"{username}_airtraffic")

In [9]:
spark.catalog.currentDatabase()

'itv011204_airtraffic'

In [65]:
%%sh

hdfs dfs -mkdir /user/`whoami`/airtraffic_all
hdfs dfs -cp -f /public/airlines_all/airport-codes /user/`whoami`/airtraffic_all
hdfs dfs -ls /user/`whoami`/airtraffic_all

Found 1 items
drwxr-xr-x   - itv011204 supergroup          0 2024-02-14 17:26 /user/itv011204/airtraffic_all/airport-codes


In [10]:
!hdfs dfs -ls /user/`whoami`/airtraffic_all/airport-codes

Found 1 items
-rw-r--r--   3 itv011204 supergroup      11411 2024-02-14 17:26 /user/itv011204/airtraffic_all/airport-codes/airport-codes-na.txt


In [11]:
!hdfs dfs -cat /user/itv011204/airtraffic_all/airport-codes/airport-codes-na.txt | tail

Yuma	AZ	USA	YUM	Canada	YZFLa	YWKCanada	YQYada	YZP

In [12]:
spark.catalog.createExternalTable?

[0;31mSignature:[0m
[0mspark[0m[0;34m.[0m[0mcatalog[0m[0;34m.[0m[0mcreateExternalTable[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtableName[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpath[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msource[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mschema[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0moptions[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Creates a table based on the dataset in a data source.

It returns the DataFrame associated with the external table.

The data source is specified by the ``source`` and a set of ``options``.
If ``source`` is not specified, the default data source configured by
``spark.sql.sources.default`` will be used.

Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and
created external table.

.. versionadded:: 2.0

In [13]:
airport_codes_path = f"/user/{username}/airtraffic_all/airport-codes/"

In [14]:
airport_codes_path

'/user/itv011204/airtraffic_all/airport-codes/'

In [15]:
spark.catalog.currentDatabase()

'itv011204_airtraffic'

In [16]:
spark.sql("DROP TABLE IF EXISTS airport_codes")

In [17]:
spark.catalog.createExternalTable(
    "airport_codes",
    path = airport_codes_path,
    source = 'CSV',
    sep = '\t',
    header="true",
    inferSchema = "true"
)

City,State,Country,IATA
Abbotsford,BC,Canada,YXX
Aberdeen,SD,USA,ABR
Abilene,TX,USA,ABI
Akron,OH,USA,CAK
Alamosa,CO,USA,ALS
Albany,GA,USA,ABY
Albany,NY,USA,ALB
Albuquerque,NM,USA,ABQ
Alexandria,LA,USA,AEX
Allentown,PA,USA,ABE


In [18]:
spark.catalog.listTables

<bound method Catalog.listTables of <pyspark.sql.catalog.Catalog object at 0x7f789dd97400>>

In [19]:
spark.catalog.listTables()

[Table(name='airport_codes', database='itv011204_airtraffic', description=None, tableType='EXTERNAL', isTemporary=False)]

In [21]:
spark.sql("DESCRIBE TABLE airport_codes").show()

+--------+---------+-------+
|col_name|data_type|comment|
+--------+---------+-------+
|    City|   string|   null|
|   State|   string|   null|
| Country|   string|   null|
|    IATA|   string|   null|
+--------+---------+-------+



In [22]:
spark.sql("DESCRIBE airport_codes").show()

+--------+---------+-------+
|col_name|data_type|comment|
+--------+---------+-------+
|    City|   string|   null|
|   State|   string|   null|
| Country|   string|   null|
|    IATA|   string|   null|
+--------+---------+-------+



In [23]:
spark.sql("DESCRIBE EXTENDED airport_codes").show(200,truncate=False)

+----------------------------+-------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                |comment|
+----------------------------+-------------------------------------------------------------------------+-------+
|City                        |string                                                                   |null   |
|State                       |string                                                                   |null   |
|Country                     |string                                                                   |null   |
|IATA                        |string                                                                   |null   |
|                            |                                                                         |       |
|# Detailed Table Information|                                                                  

In [24]:
spark.sql("DESCRIBE FORMATTED airport_codes").show(200,truncate=False)

+----------------------------+-------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                |comment|
+----------------------------+-------------------------------------------------------------------------+-------+
|City                        |string                                                                   |null   |
|State                       |string                                                                   |null   |
|Country                     |string                                                                   |null   |
|IATA                        |string                                                                   |null   |
|                            |                                                                         |       |
|# Detailed Table Information|                                                                  

In [25]:
spark.read.table("airport_codes").show(5)

+----------+-----+-------+----+
|      City|State|Country|IATA|
+----------+-----+-------+----+
|Abbotsford|   BC| Canada| YXX|
|  Aberdeen|   SD|    USA| ABR|
|   Abilene|   TX|    USA| ABI|
|     Akron|   OH|    USA| CAK|
|   Alamosa|   CO|    USA| ALS|
+----------+-----+-------+----+
only showing top 5 rows



In [26]:
spark.read.table("airport_codes").count()

526

In [27]:
spark.catalog.listColumns('airport_codes')

[Column(name='City', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='State', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='Country', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='IATA', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False)]

### 218 Define Schema for Spark Metastore Tables using StructType

In [29]:
from pyspark.sql import SparkSession
import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
        config('spark.shuffle.io.connectionTimeout','6000'). \
        config('spark.driver.memory','6g'). \
        config('spark.executor.memory','6g'). \
        config('spark.dynamicAllocation.minExecutors', '4'). \
        enableHiveSupport(). \
        appName(f'{username} | Section 18 Spark Metastore'). \
        master('yarn'). \
        getOrCreate()

In [30]:
spark.conf.set("spark.sql.shuffle.partitions","2")

In [31]:
import getpass

username = getpass.getuser()

In [32]:
username

'itv011204'

In [33]:
spark.catalog.currentDatabase()

'itv011204_airtraffic'

In [35]:
spark.sql(f"DROP DATABASE IF EXISTS {username}_hr_db CASCADE")

In [36]:
spark.sql(f"CREATE DATABASE IF NOT EXISTS {username}_hr_db")

In [37]:
spark.catalog.setCurrentDatabase(f'{username}_hr_db')

In [38]:
spark.catalog.currentDatabase()

'itv011204_hr_db'

In [39]:
spark.catalog.createTable?

[0;31mSignature:[0m
[0mspark[0m[0;34m.[0m[0mcatalog[0m[0;34m.[0m[0mcreateTable[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtableName[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpath[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msource[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mschema[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdescription[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0moptions[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Creates a table based on the dataset in a data source.

It returns the DataFrame associated with the table.

The data source is specified by the ``source`` and a set of ``options``.
If ``source`` is not specified, the default data source configured by
``spark.sql.sources.default`` will be used. When ``path`` is specified, an external table is
created from the dat

In [44]:
from pyspark.sql.types import StructType,StructField, \
                                    StringType, IntegerType, FloatType

In [46]:
employeesSchema = StructType([
    StructField('employee_id',IntegerType()),
    StructField('first_name',StringType()),
    StructField('last_name',StringType()),
    StructField('salary',FloatType()),
    StructField('nationality',StringType())
])

In [47]:
employeesSchema

StructType(List(StructField(employee_id,IntegerType,true),StructField(first_name,StringType,true),StructField(last_name,StringType,true),StructField(salary,FloatType,true),StructField(nationality,StringType,true)))

In [48]:
help(employeesSchema)

Help on StructType in module pyspark.sql.types object:

class StructType(DataType)
 |  Struct type, consisting of a list of :class:`StructField`.
 |  
 |  This is the data type representing a :class:`Row`.
 |  
 |  Iterating a :class:`StructType` will iterate over its :class:`StructField`\s.
 |  A contained :class:`StructField` can be accessed by its name or position.
 |  
 |  Examples
 |  --------
 |  >>> struct1 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct1["f1"]
 |  StructField(f1,StringType,true)
 |  >>> struct1[0]
 |  StructField(f1,StringType,true)
 |  
 |  >>> struct1 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct2 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct1 == struct2
 |  True
 |  >>> struct1 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct2 = StructType([StructField("f1", StringType(), True),
 |  ...     StructField("f2", IntegerType(), False)])
 |  >>> struct1 == struct2
 |  False
 |  

In [49]:
employeesSchema.simpleString()

'struct<employee_id:int,first_name:string,last_name:string,salary:float,nationality:string>'

In [50]:
spark.catalog.createTable?

[0;31mSignature:[0m
[0mspark[0m[0;34m.[0m[0mcatalog[0m[0;34m.[0m[0mcreateTable[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtableName[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpath[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msource[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mschema[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdescription[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0moptions[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Creates a table based on the dataset in a data source.

It returns the DataFrame associated with the table.

The data source is specified by the ``source`` and a set of ``options``.
If ``source`` is not specified, the default data source configured by
``spark.sql.sources.default`` will be used. When ``path`` is specified, an external table is
created from the dat

In [51]:
spark.catalog.currentDatabase()

'itv011204_hr_db'

In [52]:
spark.catalog.createTable(
    'employees',
    schema=employeesSchema
)

employee_id,first_name,last_name,salary,nationality


In [53]:
spark.catalog.listTables()

[Table(name='employees', database='itv011204_hr_db', description=None, tableType='MANAGED', isTemporary=False)]

In [54]:
spark.catalog.listTables(f'{username}_demo_db')

[Table(name='dual', database='itv011204_demo_db', description=None, tableType='MANAGED', isTemporary=False)]

In [55]:
spark.catalog.listTables(f'{username}_airtraffic')

[Table(name='airport_codes', database='itv011204_airtraffic', description=None, tableType='EXTERNAL', isTemporary=False)]

In [56]:
spark.catalog.listColumns('employees')

[Column(name='employee_id', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='first_name', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='last_name', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='salary', description=None, dataType='float', nullable=True, isPartition=False, isBucket=False),
 Column(name='nationality', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False)]

In [58]:
spark.sql("SHOW TABLES").show()

+---------------+---------+-----------+
|       database|tableName|isTemporary|
+---------------+---------+-----------+
|itv011204_hr_db|employees|      false|
+---------------+---------+-----------+



In [60]:
spark.sql("DESCRIBE employees").show()

+-----------+---------+-------+
|   col_name|data_type|comment|
+-----------+---------+-------+
|employee_id|      int|   null|
| first_name|   string|   null|
|  last_name|   string|   null|
|     salary|    float|   null|
|nationality|   string|   null|
+-----------+---------+-------+



In [62]:
spark.sql("DESCRIBE EXTENDED employees").show(100,truncate=False)

+----------------------------+-----------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                          |comment|
+----------------------------+-----------------------------------------------------------------------------------+-------+
|employee_id                 |int                                                                                |null   |
|first_name                  |string                                                                             |null   |
|last_name                   |string                                                                             |null   |
|salary                      |float                                                                              |null   |
|nationality                 |string                                                                             |null   |
|               

In [70]:
spark.sql("DESCRIBE FORMATTED employees").show(100,truncate=False)

+----------------------------+-----------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                          |comment|
+----------------------------+-----------------------------------------------------------------------------------+-------+
|employee_id                 |int                                                                                |null   |
|first_name                  |string                                                                             |null   |
|last_name                   |string                                                                             |null   |
|salary                      |float                                                                              |null   |
|nationality                 |string                                                                             |null   |
|               

In [65]:
spark.sql("SHOW CREATE TABLE employees").show(100,truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|createtab_stmt                                                                                                                                                         |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|CREATE TABLE `itv011204_hr_db`.`employees` (
  `employee_id` INT,
  `first_name` STRING,
  `last_name` STRING,
  `salary` FLOAT,
  `nationality` STRING)
USING parquet
|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+



In [68]:
spark.sql(f"SHOW TABLES FROM {username}_airtraffic").show(100,truncate=False)

+--------------------+-------------+-----------+
|database            |tableName    |isTemporary|
+--------------------+-------------+-----------+
|itv011204_airtraffic|airport_codes|false      |
+--------------------+-------------+-----------+



In [69]:
spark.sql(f"SHOW CREATE TABLE {username}_airtraffic.airport_codes").show(100,truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|createtab_stmt                                                                                                                                                                                                                                                                                 |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|CREATE TABLE `itv011204_airtraffic`.`airport_codes` (
  `City` STRING,
  `State` STRING,
  `Country` STRING,
  `IATA` STRING)
USI

### 219 Insering into Existing Spark Metastore tables using Spark Data Frame APIs

In [1]:
from pyspark.sql import SparkSession
import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
        config('spark.shuffle.io.connectionTimeout','6000'). \
        config('spark.driver.memory','6g'). \
        config('spark.executor.memory','6g'). \
        config('spark.dynamicAllocation.minExecutors', '4'). \
        enableHiveSupport(). \
        appName(f'{username} | Section 18 Spark Metastore'). \
        master('yarn'). \
        getOrCreate()

In [2]:
spark.conf.set("spark.sql.shuffle.partitions","2")

In [3]:
import getpass

username = getpass.getuser()

In [4]:
username

'itv011204'

In [5]:
spark.catalog.currentDatabase()

'default'

In [6]:
spark.catalog.setCurrentDatabase(f'{username}_hr_db')

In [7]:
spark.catalog.currentDatabase()

'itv011204_hr_db'

In [12]:
spark.sql("TRUNCATE TABLE employees")

In [13]:
spark.catalog.listTables()

[Table(name='employees', database='itv011204_hr_db', description=None, tableType='MANAGED', isTemporary=False)]

In [14]:
spark.catalog.listColumns('employees')

[Column(name='employee_id', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='first_name', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='last_name', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='salary', description=None, dataType='float', nullable=True, isPartition=False, isBucket=False),
 Column(name='nationality', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False)]

In [15]:
employees = [(1, "Scott", "Tiger", 1000.0, "united states"),
             (2, "Henry", "Ford", 1250.0, "India"),
             (3, "Nick", "Junior", 750.0, "united KINGDOM"),
             (4, "Bill", "Gomes", 1500.0, "AUSTRALIA")
            ]

In [16]:
spark.read.table('employees')

employee_id,first_name,last_name,salary,nationality


In [17]:
spark.read.table('employees').schema

StructType(List(StructField(employee_id,IntegerType,true),StructField(first_name,StringType,true),StructField(last_name,StringType,true),StructField(salary,FloatType,true),StructField(nationality,StringType,true)))

In [18]:
employeesDF = spark. \
    createDataFrame(employees,
        schema = """
                    employee_id INT, first_name STRING, last_name STRING,
                    salary FLOAT, nationality STRING
                """
    )

In [19]:
employeesDF.show()

+-----------+----------+---------+------+--------------+
|employee_id|first_name|last_name|salary|   nationality|
+-----------+----------+---------+------+--------------+
|          1|     Scott|    Tiger|1000.0| united states|
|          2|     Henry|     Ford|1250.0|         India|
|          3|      Nick|   Junior| 750.0|united KINGDOM|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|
+-----------+----------+---------+------+--------------+



In [20]:
employeesDF.schema

StructType(List(StructField(employee_id,IntegerType,true),StructField(first_name,StringType,true),StructField(last_name,StringType,true),StructField(salary,FloatType,true),StructField(nationality,StringType,true)))

In [21]:
spark.read.table('employees').schema

StructType(List(StructField(employee_id,IntegerType,true),StructField(first_name,StringType,true),StructField(last_name,StringType,true),StructField(salary,FloatType,true),StructField(nationality,StringType,true)))

In [22]:
employeesDF.write.insertInto("employees",overwrite=True)

In [23]:
spark.sql("SELECT * FROM employees")

employee_id,first_name,last_name,salary,nationality
3,Nick,Junior,750.0,united KINGDOM
1,Scott,Tiger,1000.0,united states
4,Bill,Gomes,1500.0,AUSTRALIA
2,Henry,Ford,1250.0,India


In [24]:
spark.read.table('employees').show()

+-----------+----------+---------+------+--------------+
|employee_id|first_name|last_name|salary|   nationality|
+-----------+----------+---------+------+--------------+
|          3|      Nick|   Junior| 750.0|united KINGDOM|
|          1|     Scott|    Tiger|1000.0| united states|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|
|          2|     Henry|     Ford|1250.0|         India|
+-----------+----------+---------+------+--------------+



### 220 Read and Process Data from Metastore Tables using DF APIs

In [1]:
from pyspark.sql import SparkSession
import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
        config('spark.shuffle.io.connectionTimeout','6000'). \
        config('spark.driver.memory','6g'). \
        config('spark.executor.memory','6g'). \
        config('spark.dynamicAllocation.minExecutors', '4'). \
        enableHiveSupport(). \
        appName(f'{username} | Section 18 Spark Metastore'). \
        master('yarn'). \
        getOrCreate()

In [2]:
spark.conf.set("spark.sql.shuffle.partitions","2")

In [3]:
import getpass

username = getpass.getuser()

In [4]:
username

'itv011204'

In [5]:
spark.catalog.currentDatabase()

'default'

In [13]:
spark.catalog.listFunctions()

[Function(name='!', description=None, className='org.apache.spark.sql.catalyst.expressions.Not', isTemporary=True),
 Function(name='%', description=None, className='org.apache.spark.sql.catalyst.expressions.Remainder', isTemporary=True),
 Function(name='&', description=None, className='org.apache.spark.sql.catalyst.expressions.BitwiseAnd', isTemporary=True),
 Function(name='*', description=None, className='org.apache.spark.sql.catalyst.expressions.Multiply', isTemporary=True),
 Function(name='+', description=None, className='org.apache.spark.sql.catalyst.expressions.Add', isTemporary=True),
 Function(name='-', description=None, className='org.apache.spark.sql.catalyst.expressions.Subtract', isTemporary=True),
 Function(name='/', description=None, className='org.apache.spark.sql.catalyst.expressions.Divide', isTemporary=True),
 Function(name='<', description=None, className='org.apache.spark.sql.catalyst.expressions.LessThan', isTemporary=True),
 Function(name='<=', description=None, cl

In [15]:
spark.sql(f"DROP DATABASE IF EXISTS {username}_airlines CASCADE")

In [16]:
spark.sql(f"CREATE DATABASE IF NOT EXISTS {username}_airlines")

In [17]:
spark.catalog.setCurrentDatabase(f'{username}_airlines')

In [18]:
spark.catalog.currentDatabase()

'itv011204_airlines'

In [19]:
airports_codes_path = f"/user/{username}/airtraffic_all/airport-codes"

In [20]:
spark.sql(f"DROP TABLE IF EXISTS {username}_airlines.airport_codes")

In [21]:
airport_codes_df =spark. \
    read. \
    csv(airports_codes_path,
       sep='\t',
       header=True,
       inferSchema=True)

In [22]:
airport_codes_df.count()

526

In [23]:
airport_codes_df.show(5)

+----------+-----+-------+----+
|      City|State|Country|IATA|
+----------+-----+-------+----+
|Abbotsford|   BC| Canada| YXX|
|  Aberdeen|   SD|    USA| ABR|
|   Abilene|   TX|    USA| ABI|
|     Akron|   OH|    USA| CAK|
|   Alamosa|   CO|    USA| ALS|
+----------+-----+-------+----+
only showing top 5 rows



In [24]:
type(airport_codes_df)

pyspark.sql.dataframe.DataFrame

In [25]:
airport_codes_df.write.saveAsTable("airport_codes")

In [26]:
airport_codes = spark.read.table("airport_codes")

In [27]:
type(airport_codes)

pyspark.sql.dataframe.DataFrame

In [28]:
spark.sql("DESCRIBE FORMATTED airport_codes").show(100,False)

+----------------------------+------------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                                 |comment|
+----------------------------+------------------------------------------------------------------------------------------+-------+
|City                        |string                                                                                    |null   |
|State                       |string                                                                                    |null   |
|Country                     |string                                                                                    |null   |
|IATA                        |string                                                                                    |null   |
|                            |                                                            

In [29]:
airport_codes.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- IATA: string (nullable = true)



In [30]:
airport_codes.\
    groupBy('State'). \
    count(). \
    show()

+-----+-----+
|State|count|
+-----+-----+
|   BC|   22|
|   SD|    7|
|   NY|   18|
|   NM|    9|
|   NE|    9|
|   MI|   18|
|  NWT|    4|
|   NC|   10|
|   NJ|    3|
|   MD|    3|
|   WV|    8|
|   MN|    8|
|   IL|   12|
|   ID|    6|
|   IA|    8|
|   MO|    8|
|   SC|    6|
|   VA|    7|
|  PEI|    1|
|   TN|    6|
+-----+-----+
only showing top 20 rows



In [31]:
from pyspark.sql.functions import count,lit, col

In [32]:
airport_codes.\
    groupBy('State'). \
    agg(count(lit(1)).alias('AirportCount')). \
    orderBy(col("AirportCount").desc()). \
    show()

+-----+------------+
|State|AirportCount|
+-----+------------+
|   CA|          29|
|   TX|          26|
|   AK|          25|
|   BC|          22|
|   NY|          18|
|   MI|          18|
|   FL|          18|
|   ON|          18|
|   MT|          14|
|   PA|          13|
|   PQ|          13|
|   IL|          12|
|   CO|          12|
|   WY|          10|
|   NC|          10|
|   NM|           9|
|   NE|           9|
|   GA|           9|
|   KS|           9|
|   WA|           9|
+-----+------------+
only showing top 20 rows



### 221 Create Spark Metastore Partitioned Tables using Data Frame APIs

In [41]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
        config('spark.sql.ui.port','0'). \
        config('spark.shuffle.io.connectionTimeout','6000'). \
        config('spark.driver.memory','6g'). \
        config('spark.executor.memory','6g'). \
        config('spark.dynamicAllocation.minExecutors', '4'). \
        enableHiveSupport(). \
        appName(f'{username} | Section 18 Exploring spark catalog'). \
        master('yarn'). \
        getOrCreate()
        

In [2]:
spark.catalog.createTable?

[0;31mSignature:[0m
[0mspark[0m[0;34m.[0m[0mcatalog[0m[0;34m.[0m[0mcreateTable[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtableName[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpath[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msource[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mschema[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdescription[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0moptions[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Creates a table based on the dataset in a data source.

It returns the DataFrame associated with the table.

The data source is specified by the ``source`` and a set of ``options``.
If ``source`` is not specified, the default data source configured by
``spark.sql.sources.default`` will be used. When ``path`` is specified, an external table is
created from the dat

In [10]:
username

'itv011204'

In [11]:
spark.conf.set('spark.sql.shuffle.partitions','2')

In [12]:
spark.sql(f'DROP DATABASE IF EXISTS {username}_retail CASCADE')

In [13]:
spark.sql(f'CREATE DATABASE IF NOT EXISTS {username}_retail')

In [14]:
spark.catalog.setCurrentDatabase(f'{username}_retail')

In [15]:
orders_path  ='/public/retail_db/orders'

In [16]:
%%sh

hdfs dfs -ls /public/retail_db/orders

Found 1 items
-rw-r--r--   2 hdfs supergroup    2999944 2021-01-28 09:27 /public/retail_db/orders/part-00000


In [18]:
spark.sql("DROP TABLE IF EXISTS orders_part")

In [None]:
%%sh
hdfs dfs -ls /user/`whoami`/retail_db/orders_part

In [21]:
%%sh

hdfs dfs -rm -R /user/`whoami`/retail_db/orders_part

rm: `/user/itv011204/retail_db/orders_part': No such file or directory


CalledProcessError: Command 'b'\nhdfs dfs -rm -R /user/`whoami`/retail_db/orders_part\n'' returned non-zero exit status 1.

In [23]:
from pyspark.sql.functions import date_format, col

In [24]:
spark. \
    read. \
    csv(
        orders_path,
        schema = """order_id INT, order_date DATE, 
                order_customer_id INT, order_status STRING"""
    ). \
    withColumn('order_month',date_format(col('order_date'),'yyyyMM')). \
    write. \
    partitionBy('order_month'). \
    parquet(f'/user/{username}/retail_db/orders_part')

In [25]:
%%sh

hdfs dfs -ls /user/`whoami`/retail_db/orders_part

Found 14 items
-rw-r--r--   3 itv011204 supergroup          0 2024-02-15 09:47 /user/itv011204/retail_db/orders_part/_SUCCESS
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201307
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201308
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201309
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201310
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201311
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:47 /user/itv011204/retail_db/orders_part/order_month=201312
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:47 /user/itv011204/retail_db/orders_part/order_month=201401
drwxr-xr-x   - itv011204 su

In [26]:
%%sh

hdfs dfs -ls -R /user/`whoami`/retail_db/orders_part

-rw-r--r--   3 itv011204 supergroup          0 2024-02-15 09:47 /user/itv011204/retail_db/orders_part/_SUCCESS
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201307
-rw-r--r--   3 itv011204 supergroup      14435 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201307/part-00000-ea10ebc3-7549-4715-904e-075a4a0165aa.c000.snappy.parquet
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201308
-rw-r--r--   3 itv011204 supergroup      49997 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201308/part-00000-ea10ebc3-7549-4715-904e-075a4a0165aa.c000.snappy.parquet
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201309
-rw-r--r--   3 itv011204 supergroup      51358 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201309/part-00000-ea10ebc3-7549-47

In [27]:
spark.read.parquet(f'/user/{username}/retail_db/orders_part/order_month=201307').show(5)

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
+--------+----------+-----------------+---------------+
only showing top 5 rows



In [31]:
spark.read.parquet(f'/user/{username}/retail_db/orders_part/order_month=201307').count()

1533

In [28]:
spark.read.parquet(f'/user/itv011204/retail_db/orders_part/order_month=201405').show(5)

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|   45168|2014-05-01|             2383|       COMPLETE|
|   45169|2014-05-01|             7212|PENDING_PAYMENT|
|   45170|2014-05-01|             2400|SUSPECTED_FRAUD|
|   45171|2014-05-01|             9003|PENDING_PAYMENT|
|   45172|2014-05-01|             2508|PENDING_PAYMENT|
+--------+----------+-----------------+---------------+
only showing top 5 rows



In [32]:
spark.read.parquet(f'/user/itv011204/retail_db/orders_part/order_month=201405').count()

5467

In [30]:
spark.read.parquet(f'/user/itv011204/retail_db/orders_part').show()

+--------+----------+-----------------+---------------+-----------+
|order_id|order_date|order_customer_id|   order_status|order_month|
+--------+----------+-----------------+---------------+-----------+
|   15488|2013-11-01|             8987|PENDING_PAYMENT|     201311|
|   15489|2013-11-01|             5359|PENDING_PAYMENT|     201311|
|   15490|2013-11-01|            10149|       COMPLETE|     201311|
|   15491|2013-11-01|            10635|        ON_HOLD|     201311|
|   15492|2013-11-01|             7784|PENDING_PAYMENT|     201311|
|   15493|2013-11-01|             1104|        ON_HOLD|     201311|
|   15494|2013-11-01|             7313|     PROCESSING|     201311|
|   15495|2013-11-01|             7067|         CLOSED|     201311|
|   15496|2013-11-01|            12153|PENDING_PAYMENT|     201311|
|   15497|2013-11-01|            11115|PENDING_PAYMENT|     201311|
|   15498|2013-11-01|            11195|       COMPLETE|     201311|
|   15499|2013-11-01|             7113|         

In [33]:
spark.read.parquet(f'/user/itv011204/retail_db/orders_part').count()

68883

In [35]:
spark. \
    catalog. \
    createTable('orders_part',
        path=f'/user/{username}/retail_db/orders_part',
        source='parquet'
    )

order_id,order_date,order_customer_id,order_status,order_month


In [36]:
spark.read.table('orders_part').show()

+--------+----------+-----------------+------------+-----------+
|order_id|order_date|order_customer_id|order_status|order_month|
+--------+----------+-----------------+------------+-----------+
+--------+----------+-----------------+------------+-----------+



In [37]:
spark.sql("SHOW PARTITIONS orders_part").show()

+---------+
|partition|
+---------+
+---------+



In [38]:
spark.catalog.recoverPartitions('orders_part')

In [40]:
spark.sql("SHOW PARTITIONS orders_part").show()

+------------------+
|         partition|
+------------------+
|order_month=201307|
|order_month=201308|
|order_month=201309|
|order_month=201310|
|order_month=201311|
|order_month=201312|
|order_month=201401|
|order_month=201402|
|order_month=201403|
|order_month=201404|
|order_month=201405|
|order_month=201406|
|order_month=201407|
+------------------+



In [42]:
spark.read.table('orders_part').show()

+--------+----------+-----------------+---------------+-----------+
|order_id|order_date|order_customer_id|   order_status|order_month|
+--------+----------+-----------------+---------------+-----------+
|   15488|2013-11-01|             8987|PENDING_PAYMENT|     201311|
|   15489|2013-11-01|             5359|PENDING_PAYMENT|     201311|
|   15490|2013-11-01|            10149|       COMPLETE|     201311|
|   15491|2013-11-01|            10635|        ON_HOLD|     201311|
|   15492|2013-11-01|             7784|PENDING_PAYMENT|     201311|
|   15493|2013-11-01|             1104|        ON_HOLD|     201311|
|   15494|2013-11-01|             7313|     PROCESSING|     201311|
|   15495|2013-11-01|             7067|         CLOSED|     201311|
|   15496|2013-11-01|            12153|PENDING_PAYMENT|     201311|
|   15497|2013-11-01|            11115|PENDING_PAYMENT|     201311|
|   15498|2013-11-01|            11195|       COMPLETE|     201311|
|   15499|2013-11-01|             7113|         

In [43]:
spark.sql("SELECT order_month, count(1) FROM orders_part GROUP BY order_month").show()

+-----------+--------+
|order_month|count(1)|
+-----------+--------+
|     201405|    5467|
|     201308|    5680|
|     201404|    5657|
|     201311|    6381|
|     201401|    5908|
|     201309|    5841|
|     201312|    5892|
|     201403|    5778|
|     201402|    5635|
|     201310|    5335|
|     201406|    5308|
|     201407|    4468|
|     201307|    1533|
+-----------+--------+



In [45]:
spark.sql("DESCRIBE orders_part").show(truncate=False)

+-----------------------+---------+-------+
|col_name               |data_type|comment|
+-----------------------+---------+-------+
|order_id               |int      |null   |
|order_date             |date     |null   |
|order_customer_id      |int      |null   |
|order_status           |string   |null   |
|order_month            |int      |null   |
|# Partition Information|         |       |
|# col_name             |data_type|comment|
|order_month            |int      |null   |
+-----------------------+---------+-------+



In [47]:
spark.read.table('orders_part'). \
    groupBy('order_month'). \
    count(). \
    show()

+-----------+-----+
|order_month|count|
+-----------+-----+
|     201311| 6381|
|     201401| 5908|
|     201309| 5841|
|     201308| 5680|
|     201404| 5657|
|     201405| 5467|
|     201310| 5335|
|     201406| 5308|
|     201407| 4468|
|     201403| 5778|
|     201402| 5635|
|     201307| 1533|
|     201312| 5892|
+-----------+-----+



### 222 Saving as Spark Metastore Partitioned table using Data Frame APIs

In [1]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
        config('spark.sql.ui.port','0'). \
        config('spark.shuffle.io.connectionTimeout','6000'). \
        config('spark.driver.memory','6g'). \
        config('spark.executor.memory','6g'). \
        config('spark.dynamicAllocation.minExecutors', '4'). \
        enableHiveSupport(). \
        appName(f'{username} | Section 18 Exploring spark catalog'). \
        master('yarn'). \
        getOrCreate()
        

In [2]:
spark.conf.set('spark.sql.shuffle.partitions','2')

In [3]:
spark.sql(f"CREATE DATABASE IF NOT EXISTS {username}_retail")

In [4]:
spark.catalog.setCurrentDatabase(f'{username}_retail')

In [5]:
spark.catalog.currentDatabase()

'itv011204_retail'

In [6]:
orders_path = '/public/retail_db/orders'

In [7]:
%%sh

hdfs dfs -ls /public/retail_db/orders

Found 1 items
-rw-r--r--   2 hdfs supergroup    2999944 2021-01-28 09:27 /public/retail_db/orders/part-00000


In [8]:
%%sh

hdfs dfs -tail /public/retail_db/orders/part-00000

014-06-12 00:00:00.0,4229,PENDING
68861,2014-06-13 00:00:00.0,3031,PENDING_PAYMENT
68862,2014-06-15 00:00:00.0,7326,PROCESSING
68863,2014-06-16 00:00:00.0,3361,CLOSED
68864,2014-06-18 00:00:00.0,9634,ON_HOLD
68865,2014-06-19 00:00:00.0,4567,SUSPECTED_FRAUD
68866,2014-06-20 00:00:00.0,3890,PENDING_PAYMENT
68867,2014-06-23 00:00:00.0,869,CANCELED
68868,2014-06-24 00:00:00.0,10184,PENDING
68869,2014-06-25 00:00:00.0,7456,PROCESSING
68870,2014-06-26 00:00:00.0,3343,COMPLETE
68871,2014-06-28 00:00:00.0,4960,PENDING
68872,2014-06-29 00:00:00.0,3354,COMPLETE
68873,2014-06-30 00:00:00.0,4545,PENDING
68874,2014-07-03 00:00:00.0,1601,COMPLETE
68875,2014-07-04 00:00:00.0,10637,ON_HOLD
68876,2014-07-06 00:00:00.0,4124,COMPLETE
68877,2014-07-07 00:00:00.0,9692,ON_HOLD
68878,2014-07-08 00:00:00.0,6753,COMPLETE
68879,2014-07-09 00:00:00.0,778,COMPLETE
68880,2014-07-13 00:00:00.0,1117,COMPLETE
68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT
68882,2014-07-22 00:00:00.0,10000,ON_HOLD
68883,2014-07-23 0

In [9]:
spark.sql("DROP TABLE IF EXISTS orders_part")

In [10]:
from pyspark.sql.functions import date_format

In [11]:
orders = spark. \
    read. \
    csv(orders_path,
        schema = """order_id INT, order_date DATE,
                order_customer_id INT, order_status STRING
                """
    ). \
    withColumn('order_month',date_format('order_date','yyyyMM'))

In [12]:
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_month: string (nullable = true)



In [13]:
orders.show(5)

+--------+----------+-----------------+---------------+-----------+
|order_id|order_date|order_customer_id|   order_status|order_month|
+--------+----------+-----------------+---------------+-----------+
|       1|2013-07-25|            11599|         CLOSED|     201307|
|       2|2013-07-25|              256|PENDING_PAYMENT|     201307|
|       3|2013-07-25|            12111|       COMPLETE|     201307|
|       4|2013-07-25|             8827|         CLOSED|     201307|
|       5|2013-07-25|            11318|       COMPLETE|     201307|
+--------+----------+-----------------+---------------+-----------+
only showing top 5 rows



In [14]:
orders. \
    write. \
    saveAsTable(
        'orders_part',
        mode='overwrite',
        partitionBy='order_month'
    )

In [15]:
%%sh 

hdfs dfs -ls -R /user/`whoami`/warehouse/`whoami`_retail.db/orders_part

-rw-r--r--   3 itv011204 supergroup          0 2024-02-15 10:25 /user/itv011204/warehouse/itv011204_retail.db/orders_part/_SUCCESS
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 10:25 /user/itv011204/warehouse/itv011204_retail.db/orders_part/order_month=201307
-rw-r--r--   3 itv011204 supergroup      14435 2024-02-15 10:25 /user/itv011204/warehouse/itv011204_retail.db/orders_part/order_month=201307/part-00000-31ef1390-ce85-4d27-878a-3da475261aa5.c000.snappy.parquet
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 10:25 /user/itv011204/warehouse/itv011204_retail.db/orders_part/order_month=201308
-rw-r--r--   3 itv011204 supergroup      49997 2024-02-15 10:25 /user/itv011204/warehouse/itv011204_retail.db/orders_part/order_month=201308/part-00000-31ef1390-ce85-4d27-878a-3da475261aa5.c000.snappy.parquet
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 10:25 /user/itv011204/warehouse/itv011204_retail.db/orders_part/order_month=201309
-rw-r--r--   3 itv011204 sup

In [19]:
spark.read.parquet(f"/user/{username}/warehouse/{username}_retail.db/orders_part/order_month=201307").show(5)

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
+--------+----------+-----------------+---------------+
only showing top 5 rows



In [20]:
spark.read.parquet(f"/user/{username}/warehouse/{username}_retail.db/orders_part").show(5)

+--------+----------+-----------------+---------------+-----------+
|order_id|order_date|order_customer_id|   order_status|order_month|
+--------+----------+-----------------+---------------+-----------+
|   15488|2013-11-01|             8987|PENDING_PAYMENT|     201311|
|   15489|2013-11-01|             5359|PENDING_PAYMENT|     201311|
|   15490|2013-11-01|            10149|       COMPLETE|     201311|
|   15491|2013-11-01|            10635|        ON_HOLD|     201311|
|   15492|2013-11-01|             7784|PENDING_PAYMENT|     201311|
+--------+----------+-----------------+---------------+-----------+
only showing top 5 rows



In [22]:
spark.sql("SHOW PARTITIONS orders_part").show()

+------------------+
|         partition|
+------------------+
|order_month=201307|
|order_month=201308|
|order_month=201309|
|order_month=201310|
|order_month=201311|
|order_month=201312|
|order_month=201401|
|order_month=201402|
|order_month=201403|
|order_month=201404|
|order_month=201405|
|order_month=201406|
|order_month=201407|
+------------------+



In [24]:
spark.read.table('orders_part').show(5)

+--------+----------+-----------------+---------------+-----------+
|order_id|order_date|order_customer_id|   order_status|order_month|
+--------+----------+-----------------+---------------+-----------+
|   15488|2013-11-01|             8987|PENDING_PAYMENT|     201311|
|   15489|2013-11-01|             5359|PENDING_PAYMENT|     201311|
|   15490|2013-11-01|            10149|       COMPLETE|     201311|
|   15491|2013-11-01|            10635|        ON_HOLD|     201311|
|   15492|2013-11-01|             7784|PENDING_PAYMENT|     201311|
+--------+----------+-----------------+---------------+-----------+
only showing top 5 rows



In [28]:
spark.sql("SELECT order_month, count(1) from orders_part GROUP BY order_month").show(5)

+-----------+--------+
|order_month|count(1)|
+-----------+--------+
|     201403|    5778|
|     201308|    5680|
|     201404|    5657|
|     201406|    5308|
|     201402|    5635|
+-----------+--------+
only showing top 5 rows



In [30]:
spark. \
    read. \
    table("orders_part"). \
    groupBy('order_month'). \
    count(). \
    show(5)

+-----------+-----+
|order_month|count|
+-----------+-----+
|     201406| 5308|
|     201403| 5778|
|     201308| 5680|
|     201404| 5657|
|     201311| 6381|
+-----------+-----+
only showing top 5 rows



In [31]:
### 223 Creating Temporary views on top of Spark Data Frames

In [1]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
        config('spark.sql.ui.port','0'). \
        config('spark.shuffle.io.connectionTimeout','6000'). \
        config('spark.driver.memory','6g'). \
        config('spark.executor.memory','6g'). \
        config('spark.dynamicAllocation.minExecutors', '4'). \
        enableHiveSupport(). \
        appName(f'{username} | Section 18 Exploring spark catalog'). \
        master('yarn'). \
        getOrCreate()
        

In [2]:
spark.conf.set('spark.sql.shuffle.partitions','2')

In [3]:
spark.catalog.setCurrentDatabase(f'{username}_airlines')

In [4]:
spark.catalog.currentDatabase()

'itv011204_airlines'

In [5]:
spark.catalog.listTables()

[Table(name='airport_codes', database='itv011204_airlines', description=None, tableType='MANAGED', isTemporary=False)]

In [6]:
airport_cods_path = f"/public/airlines_all/airport-codes"

In [7]:
airport_codes_df = spark. \
    read. \
    csv(
        airport_cods_path,
        sep = '\t',
        header = True,
        inferSchema = True
    )

In [8]:
airport_codes_df.count()

526

In [9]:
airport_codes_df.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- IATA: string (nullable = true)



In [10]:
airport_codes_df.show(5)

+----------+-----+-------+----+
|      City|State|Country|IATA|
+----------+-----+-------+----+
|Abbotsford|   BC| Canada| YXX|
|  Aberdeen|   SD|    USA| ABR|
|   Abilene|   TX|    USA| ABI|
|     Akron|   OH|    USA| CAK|
|   Alamosa|   CO|    USA| ALS|
+----------+-----+-------+----+
only showing top 5 rows



In [11]:
spark.catalog.listTables()

[Table(name='airport_codes', database='itv011204_airlines', description=None, tableType='MANAGED', isTemporary=False)]

In [12]:
airport_codes_df.createTempView('airport_codes_v')

In [13]:
spark.catalog.listTables()

[Table(name='airport_codes', database='itv011204_airlines', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='airport_codes_v', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [14]:
help(airport_codes_df.createGlobalTempView)

Help on method createGlobalTempView in module pyspark.sql.dataframe:

createGlobalTempView(name) method of pyspark.sql.dataframe.DataFrame instance
    Creates a global temporary view with this :class:`DataFrame`.
    
    The lifetime of this temporary view is tied to this Spark application.
    throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the
    catalog.
    
    .. versionadded:: 2.1.0
    
    Examples
    --------
    >>> df.createGlobalTempView("people")
    >>> df2 = spark.sql("select * from global_temp.people")
    >>> sorted(df.collect()) == sorted(df2.collect())
    True
    >>> df.createGlobalTempView("people")  # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    ...
    AnalysisException: u"Temporary table 'people' already exists;"
    >>> spark.catalog.dropGlobalTempView("people")



In [15]:
airport_codes_df.createGlobalTempView('airport_codes_g')

In [16]:
spark.sql("show tables;")

database,tableName,isTemporary
itv011204_airlines,airport_codes,False
,airport_codes_v,True


In [17]:
spark.sql("show views in global_temp")

namespace,viewName,isTemporary
global_temp,airport_codes_g,True
,airport_codes_v,True


In [18]:
spark.sql("select * from global_temp.airport_codes_g").show(5)

+----------+-----+-------+----+
|      City|State|Country|IATA|
+----------+-----+-------+----+
|Abbotsford|   BC| Canada| YXX|
|  Aberdeen|   SD|    USA| ABR|
|   Abilene|   TX|    USA| ABI|
|     Akron|   OH|    USA| CAK|
|   Alamosa|   CO|    USA| ALS|
+----------+-----+-------+----+
only showing top 5 rows



In [19]:
spark.read.table('global_temp.airport_codes_g').show(5)

+----------+-----+-------+----+
|      City|State|Country|IATA|
+----------+-----+-------+----+
|Abbotsford|   BC| Canada| YXX|
|  Aberdeen|   SD|    USA| ABR|
|   Abilene|   TX|    USA| ABI|
|     Akron|   OH|    USA| CAK|
|   Alamosa|   CO|    USA| ALS|
+----------+-----+-------+----+
only showing top 5 rows



In [20]:
airport_codes = spark.read.table('airport_codes_v')

In [49]:
airport_codes. \
    groupBy("state"). \
    count(). \
    show()

+-----+-----+
|state|count|
+-----+-----+
|   BC|   22|
|   SD|    7|
|   NY|   18|
|   NM|    9|
|   NE|    9|
|   MI|   18|
|  NWT|    4|
|   NC|   10|
|   NJ|    3|
|   MD|    3|
|   WV|    8|
|   MN|    8|
|   IL|   12|
|   ID|    6|
|   IA|    8|
|   MO|    8|
|   SC|    6|
|   VA|    7|
|  PEI|    1|
|   TN|    6|
+-----+-----+
only showing top 20 rows



In [51]:
spark.sql("SELECT state, count(1) airport_count FROM airport_codes_v GROUP BY state ORDER BY airport_count desc").show()

+-----+-------------+
|state|airport_count|
+-----+-------------+
|   CA|           29|
|   TX|           26|
|   AK|           25|
|   BC|           22|
|   NY|           18|
|   ON|           18|
|   MI|           18|
|   FL|           18|
|   MT|           14|
|   PA|           13|
|   PQ|           13|
|   IL|           12|
|   CO|           12|
|   NC|           10|
|   WY|           10|
|   NE|            9|
|   WI|            9|
|   WA|            9|
|   GA|            9|
|   NM|            9|
+-----+-------------+
only showing top 20 rows



In [12]:
Cities = ['Chennai', 'Trichy', 'Madurai', 'Coimbatore']
Price = [10,20,30,40]

serial =1

for city in Cities:
    print("Hello"+" "+ city)
    serial = serial + 1

city_selected = int(input("Choose City by entering number"))
no_of_seats = int(input("Enter number of seats you want to book"))

print("price for "+no_of_seats+" to "+ Cities[city_selected-1] + " is "+Price[city_selected -1]*no_of_seats  )

Hello Chennai
Hello Trichy
Hello Madurai
Hello Coimbatore


Choose City by entering number 1
Enter number of seats you want to book 2


TypeError: must be str, not int

In [5]:
str1="Hello"
str2="World"
print ("String 1:",str1)
print ("String 2:",str2)
str=str1+str2

String 1: Hello
String 2: World


In [2]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
        config('spark.sql.ui.port','0'). \
        config('spark.shuffle.io.connectionTimeout','6000'). \
        config('spark.driver.memory','6g'). \
        config('spark.executor.memory','6g'). \
        config('spark.dynamicAllocation.minExecutors', '4'). \
        enableHiveSupport(). \
        appName(f'{username} | Section 18 Exploring spark catalog'). \
        master('yarn'). \
        getOrCreate()

In [3]:
username

'itv011204'

In [6]:
spark.sql(f'CREATE DATABASE IF NOT EXISTS {username}_retail')

In [22]:
spark.catalog.setCurrentDatabase(f'{username}_retail')

In [5]:
spark.catalog.currentDatabase()

'itv011204_retail'

In [9]:
spark.sql('DROP TABLE IF EXISTS orders_part2')

In [10]:
%%sh

hdfs dfs -ls /public/retail_db/orders

Found 1 items
-rw-r--r--   2 hdfs supergroup    2999944 2021-01-28 09:27 /public/retail_db/orders/part-00000


In [11]:
%%sh

hdfs dfs -ls /user/`whoami`/retail_db/orders_part

Found 14 items
-rw-r--r--   3 itv011204 supergroup          0 2024-02-15 09:47 /user/itv011204/retail_db/orders_part/_SUCCESS
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201307
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201308
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201309
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201310
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:46 /user/itv011204/retail_db/orders_part/order_month=201311
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:47 /user/itv011204/retail_db/orders_part/order_month=201312
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:47 /user/itv011204/retail_db/orders_part/order_month=201401
drwxr-xr-x   - itv011204 su

In [12]:
%%sh

hdfs dfs -rm -R -skipTrash /user/`whoami`/retail_db/orders_part

Deleted /user/itv011204/retail_db/orders_part


In [13]:
orders_path = "/public/retail_db/orders"

In [14]:
from pyspark.sql.functions import date_format

In [17]:
spark.read. \
    csv(
        orders_path,
        schema = """
                    order_id INT, order_date DATE,
                    order_customer_id INT, order_status STRING
                """
    ). \
    withColumn('order_year',date_format('order_date','yyyy')). \
    withColumn('order_month',date_format('order_date','MM')). \
    write. \
    partitionBy('order_year','order_month'). \
    parquet(f'/user/{username}/retail_db/orders_part2')

In [18]:
%%sh

hdfs dfs -ls /user/`whoami`/retail_db/orders_part2

Found 3 items
-rw-r--r--   3 itv011204 supergroup          0 2024-02-18 14:12 /user/itv011204/retail_db/orders_part2/_SUCCESS
drwxr-xr-x   - itv011204 supergroup          0 2024-02-18 14:12 /user/itv011204/retail_db/orders_part2/order_year=2013
drwxr-xr-x   - itv011204 supergroup          0 2024-02-18 14:12 /user/itv011204/retail_db/orders_part2/order_year=2014


In [19]:
%%sh

hdfs dfs -ls -R /user/`whoami`/retail_db/orders_part2

-rw-r--r--   3 itv011204 supergroup          0 2024-02-18 14:12 /user/itv011204/retail_db/orders_part2/_SUCCESS
drwxr-xr-x   - itv011204 supergroup          0 2024-02-18 14:12 /user/itv011204/retail_db/orders_part2/order_year=2013
drwxr-xr-x   - itv011204 supergroup          0 2024-02-18 14:12 /user/itv011204/retail_db/orders_part2/order_year=2013/order_month=07
-rw-r--r--   3 itv011204 supergroup      14435 2024-02-18 14:12 /user/itv011204/retail_db/orders_part2/order_year=2013/order_month=07/part-00000-344acf22-02e6-4a65-9d9b-be9ee31a6267.c000.snappy.parquet
drwxr-xr-x   - itv011204 supergroup          0 2024-02-18 14:12 /user/itv011204/retail_db/orders_part2/order_year=2013/order_month=08
-rw-r--r--   3 itv011204 supergroup      49997 2024-02-18 14:12 /user/itv011204/retail_db/orders_part2/order_year=2013/order_month=08/part-00000-344acf22-02e6-4a65-9d9b-be9ee31a6267.c000.snappy.parquet
drwxr-xr-x   - itv011204 supergroup          0 2024-02-18 14:12 /user/itv011204/retail_db/orders_

In [20]:
spark.read.parquet(f'/user/{username}/retail_db/orders_part2/order_year=2014/order_month=06').show(5)

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|   49733|2014-06-01|             8572|PENDING_PAYMENT|
|   49734|2014-06-01|             3644|       COMPLETE|
|   49735|2014-06-01|             9457|       COMPLETE|
|   49736|2014-06-01|             8753|         CLOSED|
|   49737|2014-06-01|             3977|PENDING_PAYMENT|
+--------+----------+-----------------+---------------+
only showing top 5 rows



In [21]:
spark.read.parquet(f'/user/{username}/retail_db/orders_part2/order_year=2014').show(5)

+--------+----------+-----------------+---------------+-----------+
|order_id|order_date|order_customer_id|   order_status|order_month|
+--------+----------+-----------------+---------------+-----------+
|   25876|2014-01-01|             3414|PENDING_PAYMENT|          1|
|   25877|2014-01-01|             5549|PENDING_PAYMENT|          1|
|   25878|2014-01-01|             9084|        PENDING|          1|
|   25879|2014-01-01|             5118|        PENDING|          1|
|   25880|2014-01-01|            10146|       CANCELED|          1|
+--------+----------+-----------------+---------------+-----------+
only showing top 5 rows



In [22]:
spark.read.parquet(f'/user/{username}/retail_db/orders_part2').show(5)

+--------+----------+-----------------+---------------+----------+-----------+
|order_id|order_date|order_customer_id|   order_status|order_year|order_month|
+--------+----------+-----------------+---------------+----------+-----------+
|   15488|2013-11-01|             8987|PENDING_PAYMENT|      2013|         11|
|   15489|2013-11-01|             5359|PENDING_PAYMENT|      2013|         11|
|   15490|2013-11-01|            10149|       COMPLETE|      2013|         11|
|   15491|2013-11-01|            10635|        ON_HOLD|      2013|         11|
|   15492|2013-11-01|             7784|PENDING_PAYMENT|      2013|         11|
+--------+----------+-----------------+---------------+----------+-----------+
only showing top 5 rows



In [24]:
spark. \
    catalog. \
    createTable(
        'orders_part2',
        path=f'/user/{username}/retail_db/orders_part2',
        source='parquet'        
    )

order_id,order_date,order_customer_id,order_status,order_year,order_month


In [25]:
spark.read.table('orders_part2')

order_id,order_date,order_customer_id,order_status,order_year,order_month


In [26]:
spark.sql("describe orders_part2")

col_name,data_type,comment
order_id,int,
order_date,date,
order_customer_id,int,
order_status,string,
order_year,int,
order_month,int,
# Partition Infor...,,
# col_name,data_type,comment
order_year,int,
order_month,int,


In [27]:
spark.sql("SHOW PARTITIONS orders_part2")

partition


In [29]:
spark.catalog.recoverPartitions('orders_part2')

In [31]:
spark.sql("show partitions orders_part2").show(truncate=False)

+------------------------------+
|partition                     |
+------------------------------+
|order_year=2013/order_month=07|
|order_year=2013/order_month=08|
|order_year=2013/order_month=09|
|order_year=2013/order_month=10|
|order_year=2013/order_month=11|
|order_year=2013/order_month=12|
|order_year=2014/order_month=01|
|order_year=2014/order_month=02|
|order_year=2014/order_month=03|
|order_year=2014/order_month=04|
|order_year=2014/order_month=05|
|order_year=2014/order_month=06|
|order_year=2014/order_month=07|
+------------------------------+



In [32]:
spark.read.table('orders_part2').show(5)

+--------+----------+-----------------+---------------+----------+-----------+
|order_id|order_date|order_customer_id|   order_status|order_year|order_month|
+--------+----------+-----------------+---------------+----------+-----------+
|   15488|2013-11-01|             8987|PENDING_PAYMENT|      2013|         11|
|   15489|2013-11-01|             5359|PENDING_PAYMENT|      2013|         11|
|   15490|2013-11-01|            10149|       COMPLETE|      2013|         11|
|   15491|2013-11-01|            10635|        ON_HOLD|      2013|         11|
|   15492|2013-11-01|             7784|PENDING_PAYMENT|      2013|         11|
+--------+----------+-----------------+---------------+----------+-----------+
only showing top 5 rows



In [36]:
spark.read. \
    csv(
        orders_path,
        schema = """
                    order_id INT, order_date DATE,
                    order_customer_id INT, order_status STRING
                """
    ). \
    withColumn('order_year',date_format('order_date','yyyy')). \
    withColumn('order_month',date_format('order_date','MM')). \
    write. \
    partitionBy('order_year','order_month'). \
    saveAsTable('orders_part3')

In [37]:
spark.catalog.listTables()

[Table(name='orders_part', database='itv011204_retail', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='orders_part2', database='itv011204_retail', description=None, tableType='EXTERNAL', isTemporary=False),
 Table(name='orders_part3', database='itv011204_retail', description=None, tableType='MANAGED', isTemporary=False)]

In [38]:
spark.sql("select * from orders_part3 limit 10").show()

+--------+----------+-----------------+---------------+----------+-----------+
|order_id|order_date|order_customer_id|   order_status|order_year|order_month|
+--------+----------+-----------------+---------------+----------+-----------+
|   15488|2013-11-01|             8987|PENDING_PAYMENT|      2013|         11|
|   15489|2013-11-01|             5359|PENDING_PAYMENT|      2013|         11|
|   15490|2013-11-01|            10149|       COMPLETE|      2013|         11|
|   15491|2013-11-01|            10635|        ON_HOLD|      2013|         11|
|   15492|2013-11-01|             7784|PENDING_PAYMENT|      2013|         11|
|   15493|2013-11-01|             1104|        ON_HOLD|      2013|         11|
|   15494|2013-11-01|             7313|     PROCESSING|      2013|         11|
|   15495|2013-11-01|             7067|         CLOSED|      2013|         11|
|   15496|2013-11-01|            12153|PENDING_PAYMENT|      2013|         11|
|   15497|2013-11-01|            11115|PENDING_PAYME

In [9]:
spark.sql("select count(1) from orders_part3").show()

+--------+
|count(1)|
+--------+
|   68883|
+--------+



In [10]:
spark.sql("describe orders_part3")

col_name,data_type,comment
order_id,int,
order_date,date,
order_customer_id,int,
order_status,string,
order_year,string,
order_month,string,
# Partition Infor...,,
# col_name,data_type,comment
order_year,string,
order_month,string,


In [20]:
%%sh

hdfs dfs -ls /user/`whoami`/warehouse/`whoami`_retail.db/orders_part3

Found 3 items
-rw-r--r--   3 itv011204 supergroup          0 2024-02-18 14:44 /user/itv011204/warehouse/itv011204_retail.db/orders_part3/_SUCCESS
drwxr-xr-x   - itv011204 supergroup          0 2024-02-18 14:44 /user/itv011204/warehouse/itv011204_retail.db/orders_part3/order_year=2013
drwxr-xr-x   - itv011204 supergroup          0 2024-02-18 14:44 /user/itv011204/warehouse/itv011204_retail.db/orders_part3/order_year=2014


In [23]:
spark.sql("describe formatted orders_part2").show(100,truncate=False)

+----------------------------+-------------------------------------------------------------------+-------+
|col_name                    |data_type                                                          |comment|
+----------------------------+-------------------------------------------------------------------+-------+
|order_id                    |int                                                                |null   |
|order_date                  |date                                                               |null   |
|order_customer_id           |int                                                                |null   |
|order_status                |string                                                             |null   |
|order_year                  |int                                                                |null   |
|order_month                 |int                                                                |null   |
|# Partition Information     |       

In [24]:
spark.sql("describe formatted orders_part").show(100,truncate=False)

+----------------------------+--------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                             |comment|
+----------------------------+--------------------------------------------------------------------------------------+-------+
|order_id                    |int                                                                                   |null   |
|order_date                  |date                                                                                  |null   |
|order_customer_id           |int                                                                                   |null   |
|order_status                |string                                                                                |null   |
|order_month                 |string                                                                                |n

In [15]:
spark.sql("describe formatted orders_part3").show(100,truncate=False)

+----------------------------+---------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                              |comment|
+----------------------------+---------------------------------------------------------------------------------------+-------+
|order_id                    |int                                                                                    |null   |
|order_date                  |date                                                                                   |null   |
|order_customer_id           |int                                                                                    |null   |
|order_status                |string                                                                                 |null   |
|order_year                  |string                                                                           

In [23]:
spark.read.parquet("/user/itv011204/warehouse/itv011204_retail.db/orders_part3/order_year=2013").show(5)

+--------+----------+-----------------+---------------+-----------+
|order_id|order_date|order_customer_id|   order_status|order_month|
+--------+----------+-----------------+---------------+-----------+
|   15488|2013-11-01|             8987|PENDING_PAYMENT|         11|
|   15489|2013-11-01|             5359|PENDING_PAYMENT|         11|
|   15490|2013-11-01|            10149|       COMPLETE|         11|
|   15491|2013-11-01|            10635|        ON_HOLD|         11|
|   15492|2013-11-01|             7784|PENDING_PAYMENT|         11|
+--------+----------+-----------------+---------------+-----------+
only showing top 5 rows



In [25]:
spark.read.parquet("/user/itv011204/warehouse/itv011204_retail.db/orders_part3/order_year=2013").count()

30662

In [22]:
spark.read.parquet("/user/itv011204/warehouse/itv011204_retail.db/orders_part3/order_year=2014").show(5)

+--------+----------+-----------------+---------------+-----------+
|order_id|order_date|order_customer_id|   order_status|order_month|
+--------+----------+-----------------+---------------+-----------+
|   25876|2014-01-01|             3414|PENDING_PAYMENT|          1|
|   25877|2014-01-01|             5549|PENDING_PAYMENT|          1|
|   25878|2014-01-01|             9084|        PENDING|          1|
|   25879|2014-01-01|             5118|        PENDING|          1|
|   25880|2014-01-01|            10146|       CANCELED|          1|
+--------+----------+-----------------+---------------+-----------+
only showing top 5 rows



In [26]:
spark.read.parquet("/user/itv011204/warehouse/itv011204_retail.db/orders_part3/order_year=2014").count()

38221

In [27]:
30662+38221

68883

In [30]:
spark.sql("show partitions orders_part3").show(truncate=False)

+------------------------------+
|partition                     |
+------------------------------+
|order_year=2013/order_month=07|
|order_year=2013/order_month=08|
|order_year=2013/order_month=09|
|order_year=2013/order_month=10|
|order_year=2013/order_month=11|
|order_year=2013/order_month=12|
|order_year=2014/order_month=01|
|order_year=2014/order_month=02|
|order_year=2014/order_month=03|
|order_year=2014/order_month=04|
|order_year=2014/order_month=05|
|order_year=2014/order_month=06|
|order_year=2014/order_month=07|
+------------------------------+



In [32]:
spark.catalog.setCurrentDatabase(f'{username}_airlines')

In [33]:
spark.catalog.listTables()

[Table(name='airport_codes', database='itv011204_airlines', description=None, tableType='MANAGED', isTemporary=False)]

In [39]:
airports_codes_path=f"/public/airlines_all/airport-codes"

In [38]:
!hdfs dfs -ls /public/airlines_all/airport-codes

Found 1 items
-rw-r--r--   2 hdfs supergroup      11411 2021-01-28 10:48 /public/airlines_all/airport-codes/airport-codes-na.txt


In [40]:
airport_codes_df = spark.read.csv(airports_codes_path, sep='\t',header=True, inferSchema=True)

In [41]:
airport_codes_df.createTempView("airport_codes_v")

In [42]:
spark.catalog.listTables()

[Table(name='airport_codes', database='itv011204_airlines', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='airport_codes_v', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [43]:
spark.sql("SHOW TABLES")

database,tableName,isTemporary
itv011204_airlines,airport_codes,False
,airport_codes_v,True


In [45]:
spark.sql("describe formatted airport_codes_v").show(truncate=False)

+--------+---------+-------+
|col_name|data_type|comment|
+--------+---------+-------+
|City    |string   |null   |
|State   |string   |null   |
|Country |string   |null   |
|IATA    |string   |null   |
+--------+---------+-------+



In [47]:
spark.sql("describe formatted airport_codes").show(truncate=False)

+----------------------------+------------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                                 |comment|
+----------------------------+------------------------------------------------------------------------------------------+-------+
|City                        |string                                                                                    |null   |
|State                       |string                                                                                    |null   |
|Country                     |string                                                                                    |null   |
|IATA                        |string                                                                                    |null   |
|                            |                                                            

In [48]:
!hdfs dfs -ls /user/`whoami`/warehouse/itv011204_airlines.db/

Found 1 items
drwxr-xr-x   - itv011204 supergroup          0 2024-02-15 09:15 /user/itv011204/warehouse/itv011204_airlines.db/airport_codes


In [None]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
        config('spark.sql.ui.port','0'). \
        enableHiveSupport(). \
        appName(f'{username} | Section 18 Exploring spark catalog'). \
        master('yarn'). \
        getOrCreate()

In [1]:
print("Hello")

Hello


In [None]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
        config('spark.sql.ui.port','0'). \
        config('spark.shuffle.io.connectionTimeout','6000'). \
        config('spark.driver.memory','6g'). \
        config('spark.executor.memory','6g'). \
        config('spark.dynamicAllocation.minExecutors', '4'). \
        enableHiveSupport(). \
        appName(f'{username} | Section 18 Exploring spark catalog'). \
        master('yarn'). \
        getOrCreate()

In [None]:
username

In [None]:
spark.conf.set('spark.sql.shuffle.partitions','2')

In [None]:
spark.catalog.setCurrentDatabase(f'{username}_retail')

In [None]:
spark.catalog.currentDatabase()

In [None]:
spark.sql('show tables')

In [None]:
spark.sql("DROP TABLE IF EXISTS orders")

In [None]:
spark.sql("""
    CREATE TABLE orders(
        order_id INT,
        order_date STRING,
        order_customer_id INT,
        order_status STRING
    ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    """)

In [None]:
spark.sql("""
    LOAD DATA LOCAL INPATH '/data/retail_db/orders' INTO TABLE orders
""")

In [None]:
spark.sql("SHOW TABLES")

In [None]:
spark.sql("select count(1) from orders").show()

In [None]:
spark.sql("""
    CREATE TABLE order_items (
        order_item_id INT,
        order_item_order_id INT,
        order_item_product_id INT,
        order_item_quantity INT,
        order_item_subtotal FLOAT,
        order_item_product_price FLOAT
    ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
""")

In [None]:
spark.sql("""
    LOAD DATA LOCAL inPATH '/data/retail_db/order_items' into table order_items
""")

In [None]:
spark.sql("select * from order_items limit 5")

In [None]:
spark.sql("select * from orders limit 5")

In [None]:
spark.sql("select * , ROW_NUMBER() over (Order by order_id) as rowid from orders").show(15)

In [None]:
spark.sql("""
    select * from (
        SELECT * , ROW_NUMBER() over (Order by order_id) as rowid from orders
    )t
     WHERE rowid>5 AND rowid<11
""").show(5)

In [None]:
spark.sql("""
    select * from (
        SELECT * , ROW_NUMBER() over (Order by order_id) as rowid from orders WHERE order_status in ('COMPLETE','CLOSED')
    )t
     WHERE rowid>10 AND rowid<16
""").show(5)

In [None]:
products = spark.read.json('/public/retail_db_json/products')

In [None]:
products.printSchema()

In [None]:
products.show(5)

In [None]:
products.createTempView('products_v')

In [None]:
spark.sql("SHOW TABLES")

In [1]:
spark.sql("""
    SELECT o.order_date,
        p.product_id,
        p.product_name,
        round(sum(oi.order_item_subtotal),2) AS revenue
    FROM orders as o JOIN order_items as oi
        ON o.order_id = oi.order_item_order_id
    JOIN products_v as p
        ON p.product_id = oi.order_item_product_id
    WHERE o.order_status IN ('COMPLETE','CLOSED')
    GROUP BY o.order_date,
        p.product_id,
        p.product_name
    ORDER BY o.order_date, revenue DESC
""").show(5,truncate=False)

NameError: name 'spark' is not defined