## Section 14 Apache Spark using Python

### 158 Starting Spark

In [1]:
from pyspark.sql import SparkSession

import getpass
username = getpass.getuser()

In [2]:
username

'itv011204'

In [3]:
# Create the notebook's Spark session through the builder's getOrCreate(), with Hive
# support enabled and the SQL warehouse directory placed under /user/<username>/warehouse.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f"{username} | Python - Data processing - Overview")
    .master("yarn")
    .getOrCreate()
)

In [4]:
spark.version

'3.1.2'

In [5]:
spark

In [20]:
spark.read

<pyspark.sql.readwriter.DataFrameReader at 0x7f3c678a96d8>

In [None]:
spark.read.csv?

In [None]:
help(spark.read.csv)

In [8]:
# Read the orders CSV files with an explicit DDL-style schema (column name + type for each
# of the four fields) and print the first rows with show().
spark. \
    read. \
    csv('/public/retail_db/orders',
        schema='''
            order_id INT, 
            order_date STRING, 
            order_customer_id INT, 
            order_status STRING
        '''
       ). \
    show()

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|
|       6|2013-07-25 00:00:...|             7130|       COMPLETE|
|       7|2013-07-25 00:00:...|             4530|       COMPLETE|
|       8|2013-07-25 00:00:...|             2911|     PROCESSING|
|       9|2013-07-25 00:00:...|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|             1837|         CLOSED|
|      13|

In [9]:
# Preview the JSON copy of the orders data; no schema is passed to the reader here.
(
    spark.read
    .json('/public/retail_db_json/orders')
    .show()
)

+-----------------+--------------------+--------+---------------+
|order_customer_id|          order_date|order_id|   order_status|
+-----------------+--------------------+--------+---------------+
|            11599|2013-07-25 00:00:...|       1|         CLOSED|
|              256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|            12111|2013-07-25 00:00:...|       3|       COMPLETE|
|             8827|2013-07-25 00:00:...|       4|         CLOSED|
|            11318|2013-07-25 00:00:...|       5|       COMPLETE|
|             7130|2013-07-25 00:00:...|       6|       COMPLETE|
|             4530|2013-07-25 00:00:...|       7|       COMPLETE|
|             2911|2013-07-25 00:00:...|       8|     PROCESSING|
|             5657|2013-07-25 00:00:...|       9|PENDING_PAYMENT|
|             5648|2013-07-25 00:00:...|      10|PENDING_PAYMENT|
|              918|2013-07-25 00:00:...|      11| PAYMENT_REVIEW|
|             1837|2013-07-25 00:00:...|      12|         CLOSED|
|         

### 160 Understanding Airlines data

In [1]:
from pyspark.sql import SparkSession

import getpass
username = getpass.getuser()

In [2]:
username

'itv011204'

In [3]:
# Create the notebook's Spark session through the builder's getOrCreate(), with Hive
# support enabled and the SQL warehouse directory placed under /user/<username>/warehouse.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Data Processing - Overview')
    .master('yarn')
    .getOrCreate()
)

In [13]:
spark

In [None]:
%%sh

hdfs dfs -ls /public/airlines_all/airlines

In [19]:
%%sh

hdfs dfs -ls -h /public/airlines_all/airlines/part-00000

-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 08:56 /public/airlines_all/airlines/part-00000


In [20]:
# Load one part file as plain text; the recorded show() output below has a single
# column named "value" holding each raw line.
airlines = spark.read.text("/public/airlines_all/airlines/part-00000")

In [21]:
type(airlines)

pyspark.sql.dataframe.DataFrame

In [22]:
help(airlines.show)

Help on method show in module pyspark.sql.dataframe:

show(n=20, truncate=True, vertical=False) method of pyspark.sql.dataframe.DataFrame instance
    Prints the first ``n`` rows to the console.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    n : int, optional
        Number of rows to show.
    truncate : bool, optional
        If set to ``True``, truncate strings longer than 20 chars by default.
        If set to a number greater than one, truncates long strings to length ``truncate``
        and align cells right.
    vertical : bool, optional
        If set to ``True``, print output rows vertically (one line
        per column value).
    
    Examples
    --------
    >>> df
    DataFrame[age: int, name: string]
    >>> df.show()
    +---+-----+
    |age| name|
    +---+-----+
    |  2|Alice|
    |  5|  Bob|
    +---+-----+
    >>> df.show(truncate=3)
    +---+----+
    |age|name|
    +---+----+
    |  2| Ali|
    |  5| Bob|
    +---+----+
    >>> df.show(v

In [23]:
airlines.show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                                                                                                                                                |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Year,Month,Dayo

### 161 Inferring Schema using Spark DataFrame APIs

In [5]:
# Load only the first part file as CSV, with header=True and inferSchema=True so that
# the reader works out the column names and types instead of being given a schema.
airlines_part_00000 = spark.read.csv(
    "/public/airlines_all/airlines/part-00000",
    header=True,
    inferSchema=True,
)

In [6]:
type(airlines_part_00000)

pyspark.sql.dataframe.DataFrame

In [7]:
airlines_part_00000.show(truncate=False)

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|IsArrDelayed|IsDepDelayed|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|1987|10   |14  

In [8]:
airlines_part_00000.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Car

In [9]:
airlines_part_00000.schema

StructType(List(StructField(Year,IntegerType,true),StructField(Month,IntegerType,true),StructField(DayofMonth,IntegerType,true),StructField(DayOfWeek,IntegerType,true),StructField(DepTime,StringType,true),StructField(CRSDepTime,IntegerType,true),StructField(ArrTime,StringType,true),StructField(CRSArrTime,IntegerType,true),StructField(UniqueCarrier,StringType,true),StructField(FlightNum,IntegerType,true),StructField(TailNum,StringType,true),StructField(ActualElapsedTime,StringType,true),StructField(CRSElapsedTime,IntegerType,true),StructField(AirTime,StringType,true),StructField(ArrDelay,StringType,true),StructField(DepDelay,StringType,true),StructField(Origin,StringType,true),StructField(Dest,StringType,true),StructField(Distance,StringType,true),StructField(TaxiIn,StringType,true),StructField(TaxiOut,StringType,true),StructField(Cancelled,IntegerType,true),StructField(CancellationCode,StringType,true),StructField(Diverted,IntegerType,true),StructField(CarrierDelay,StringType,true),Str

In [10]:
type(airlines_part_00000.schema)

pyspark.sql.types.StructType

In [11]:
# Infer the schema from a single part file and keep only the resulting StructType,
# so it can be handed to later reads.
airlines_schema = spark.read.csv(
    "/public/airlines_all/airlines/part-00000",
    header=True,
    inferSchema=True,
).schema

In [12]:
type(airlines_schema)

pyspark.sql.types.StructType

In [16]:
help(spark.read.csv)

Help on method csv in module pyspark.sql.readwriter:

csv(path, schema=None, sep=None, encoding=None, quote=None, escape=None, comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, samplingRatio=None, enforceSchema=None, emptyValue=None, locale=None, lineSep=None, pathGlobFilter=None, recursiveFileLookup=None, modifiedBefore=None, modifiedAfter=None, unescapedQuoteHandling=None) method of pyspark.sql.readwriter.DataFrameReader instance
    Loads a CSV file and returns the result as a  :class:`DataFrame`.
    
    This function will go through the input once to determine the input schema if
    ``inferSchema`` is enabled. To avoid going through the entire data once, di

In [17]:
# Read every file matching part* under the airlines directory, supplying the previously
# inferred airlines_schema rather than passing inferSchema. In this variant the schema is
# attached with DataFrameReader.schema(...) before calling csv(...).
airlines = spark.read. \
    schema(airlines_schema). \
    csv("/public/airlines_all/airlines/part*",
           header=True
       )

In [18]:
# Same read over all part* files, but with the schema passed as the schema= keyword
# argument of csv(...). This reassigns `airlines`.
airlines = spark.read. \
    csv("/public/airlines_all/airlines/part*",
           header=True,
           schema=airlines_schema
       )

In [19]:
help(airlines)

Help on DataFrame in module pyspark.sql.dataframe object:

class DataFrame(pyspark.sql.pandas.map_ops.PandasMapOpsMixin, pyspark.sql.pandas.conversion.PandasConversionMixin)
 |  A distributed collection of data grouped into named columns.
 |  
 |  A :class:`DataFrame` is equivalent to a relational table in Spark SQL,
 |  and can be created using various functions in :class:`SparkSession`::
 |  
 |      people = spark.read.parquet("...")
 |  
 |  Once created, it can be manipulated using the various domain-specific-language
 |  (DSL) functions defined in: :class:`DataFrame`, :class:`Column`.
 |  
 |  To select a column from the :class:`DataFrame`, use the apply method::
 |  
 |      ageCol = people.age
 |  
 |  A more concrete example::
 |  
 |      # To create DataFrame using SparkSession
 |      people = spark.read.parquet("...")
 |      department = spark.read.parquet("...")
 |  
 |      people.filter(people.age > 30).join(department, people.deptId == department.id) \
 |        .grou

In [20]:
# Counts the rows of `airlines`, which was built from every part* file in the directory.
# In the recorded run this cell ended with KeyboardInterrupt, i.e. it was stopped before
# it returned a result.
airlines.count()

KeyboardInterrupt: 

In [21]:
airlines.show()

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|IsArrDelayed|IsDepDelayed|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|2001|    8|    

In [23]:
airlines.schema

StructType(List(StructField(Year,IntegerType,true),StructField(Month,IntegerType,true),StructField(DayofMonth,IntegerType,true),StructField(DayOfWeek,IntegerType,true),StructField(DepTime,StringType,true),StructField(CRSDepTime,IntegerType,true),StructField(ArrTime,StringType,true),StructField(CRSArrTime,IntegerType,true),StructField(UniqueCarrier,StringType,true),StructField(FlightNum,IntegerType,true),StructField(TailNum,StringType,true),StructField(ActualElapsedTime,StringType,true),StructField(CRSElapsedTime,IntegerType,true),StructField(AirTime,StringType,true),StructField(ArrDelay,StringType,true),StructField(DepDelay,StringType,true),StructField(Origin,StringType,true),StructField(Dest,StringType,true),StructField(Distance,StringType,true),StructField(TaxiIn,StringType,true),StructField(TaxiOut,StringType,true),StructField(Cancelled,IntegerType,true),StructField(CancellationCode,StringType,true),StructField(Diverted,IntegerType,true),StructField(CarrierDelay,StringType,true),Str

### 162 Previewing Airlines Data using Spark Data Frame APIs

In [1]:
from pyspark.sql import SparkSession

import getpass
username = getpass.getuser()

In [2]:
# Create the notebook's Spark session through the builder's getOrCreate(), with Hive
# support enabled and the SQL warehouse directory placed under /user/<username>/warehouse.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Data Processing - Overview')
    .master('yarn')
    .getOrCreate()
)

In [4]:
# Infer the schema from a single part file and keep only the schema object for reuse
# by the next read.
airlines_schema = spark.read.csv(
    "/public/airlines_all/airlines/part-00000",
    header=True,
    inferSchema=True,
).schema

In [5]:
# Read only the files matching part-0000* (a subset of the directory), applying the
# schema inferred above instead of inferring it again.
airlines = spark.read.csv(
    "/public/airlines_all/airlines/part-0000*",
    header=True,
    schema=airlines_schema,
)

In [6]:
airlines.show(10, truncate=False)

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|IsArrDelayed|IsDepDelayed|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|1987|12   |20  

In [7]:
airlines.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Car

In [9]:
airlines.count()

6489231

In [None]:
airlines.distinct().count()

### 163 Overview of DataFrame APIs

In [1]:
from pyspark.sql import SparkSession

import getpass
username = getpass.getuser()

# Create the notebook's Spark session through the builder's getOrCreate(), with Hive
# support enabled and the SQL warehouse directory placed under /user/<username>/warehouse.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Data Processing - Overview')
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Sample records used to build a DataFrame by hand. Each tuple is
# (employee_id, first_name, last_name, salary, nationality); the nationality values
# deliberately keep their mixed letter casing.
employees = [
    (1, "Scott", "Tiger", 1000.0, "united states"),
    (2, "Henry", "Ford", 1250.0, "India"),
    (3, "Nick", "Junior", 750.0, "united KINGDOM"),
    (4, "Bill", "Gomes", 1500.0, "AUSTRALIA"),
]

In [3]:
type(employees)

list

In [4]:
employees[0]

(1, 'Scott', 'Tiger', 1000.0, 'united states')

In [5]:
type(employees[0])

tuple

In [6]:
spark.createDataFrame?

Signature:
spark.createDataFrame(
    data,
    schema=None,
    samplingRatio=None,
    verifySchema=True,
)
Docstring:
Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

When ``schema`` is a list of column names, the type of each column
will be inferred from ``data``.

When ``schema`` is ``None``, it will try to infer the schema (column names and types)
from ``data``, which should be an RDD of either :class:`Row`,
:class:`namedtuple`, or :class:`dict`.

When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
the real data, or an exception will be thrown at runtime. If

In [8]:
# Turn the local `employees` list of tuples into a Spark DataFrame. The schema is given
# as a DDL-style string with one "name TYPE" pair per tuple position.
employeesDF = spark.createDataFrame(
    employees,
    schema = """employee_id INT, first_name STRING,
                last_name STRING, salary FLOAT, nationality STRING"""
    )

In [9]:
employeesDF.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- nationality: string (nullable = true)



In [10]:
employeesDF.count()

4

In [12]:
employeesDF.show()

+-----------+----------+---------+------+--------------+
|employee_id|first_name|last_name|salary|   nationality|
+-----------+----------+---------+------+--------------+
|          1|     Scott|    Tiger|1000.0| united states|
|          2|     Henry|     Ford|1250.0|         India|
|          3|      Nick|   Junior| 750.0|united KINGDOM|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|
+-----------+----------+---------+------+--------------+



In [13]:
employeesDF

employee_id,first_name,last_name,salary,nationality
1,Scott,Tiger,1000.0,united states
2,Henry,Ford,1250.0,India
3,Nick,Junior,750.0,united KINGDOM
4,Bill,Gomes,1500.0,AUSTRALIA


In [14]:
employeesDF.select("*")

employee_id,first_name,last_name,salary,nationality
1,Scott,Tiger,1000.0,united states
2,Henry,Ford,1250.0,India
3,Nick,Junior,750.0,united KINGDOM
4,Bill,Gomes,1500.0,AUSTRALIA


In [15]:
employeesDF.select("*").show()

+-----------+----------+---------+------+--------------+
|employee_id|first_name|last_name|salary|   nationality|
+-----------+----------+---------+------+--------------+
|          1|     Scott|    Tiger|1000.0| united states|
|          2|     Henry|     Ford|1250.0|         India|
|          3|      Nick|   Junior| 750.0|united KINGDOM|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|
+-----------+----------+---------+------+--------------+



In [18]:
employeesDF.select("first_name","last_name").show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     Scott|    Tiger|
|     Henry|     Ford|
|      Nick|   Junior|
|      Bill|    Gomes|
+----------+---------+



In [19]:
employeesDF.drop("nationality").show()

+-----------+----------+---------+------+
|employee_id|first_name|last_name|salary|
+-----------+----------+---------+------+
|          1|     Scott|    Tiger|1000.0|
|          2|     Henry|     Ford|1250.0|
|          3|      Nick|   Junior| 750.0|
|          4|      Bill|    Gomes|1500.0|
+-----------+----------+---------+------+



In [20]:
# Import only the functions this section uses (concat and lit). A wildcard import of
# pyspark.sql.functions brings in names such as sum, min, max and round, which would
# shadow the Python built-ins of the same name for the rest of the kernel session.
from pyspark.sql.functions import concat, lit

In [21]:
employeesDF.withColumn('full_name',concat('first_name',lit(' '), 'last_name'))

employee_id,first_name,last_name,salary,nationality,full_name
1,Scott,Tiger,1000.0,united states,Scott Tiger
2,Henry,Ford,1250.0,India,Henry Ford
3,Nick,Junior,750.0,united KINGDOM,Nick Junior
4,Bill,Gomes,1500.0,AUSTRALIA,Bill Gomes


In [22]:
employeesDF.withColumn('full_name',concat('first_name',lit(' '), 'last_name')).show()

+-----------+----------+---------+------+--------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|  full_name|
+-----------+----------+---------+------+--------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states|Scott Tiger|
|          2|     Henry|     Ford|1250.0|         India| Henry Ford|
|          3|      Nick|   Junior| 750.0|united KINGDOM|Nick Junior|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA| Bill Gomes|
+-----------+----------+---------+------+--------------+-----------+



In [24]:
employeesDF.selectExpr('*','concat(first_name, " ", last_name) AS full_name').show()

+-----------+----------+---------+------+--------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|  full_name|
+-----------+----------+---------+------+--------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states|Scott Tiger|
|          2|     Henry|     Ford|1250.0|         India| Henry Ford|
|          3|      Nick|   Junior| 750.0|united KINGDOM|Nick Junior|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA| Bill Gomes|
+-----------+----------+---------+------+--------------+-----------+



In [25]:
employeesDF.selectExpr('*','concat(first_name, " ", last_name) AS full_name')

employee_id,first_name,last_name,salary,nationality,full_name
1,Scott,Tiger,1000.0,united states,Scott Tiger
2,Henry,Ford,1250.0,India,Henry Ford
3,Nick,Junior,750.0,united KINGDOM,Nick Junior
4,Bill,Gomes,1500.0,AUSTRALIA,Bill Gomes


### 164 Functions on DataFrames

In [1]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

# Create the notebook's Spark session through the builder's getOrCreate(), with Hive
# support enabled.
# The warehouse directory must be an f-string: without the `f` prefix the literal text
# "{username}" is sent to Spark instead of the current user's name.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse')
    .enableHiveSupport()
    .appName(f'{username} | Python - Data Processing - Overview')
    .master('yarn')
    .getOrCreate()
)

In [12]:
# Hand-written sample rows, one tuple per employee, in the order
# (employee_id, first_name, last_name, salary, nationality).
employees = [
    (1, "Scott", "Tiger", 1000.0, "united states"),
    (2, "Henry", "Ford", 1250.0, "India"),
    (3, "Nick", "Junior", 750.0, "united KINGDOM"),
    (4, "Bill", "Gomes", 1500.0, "AUSTRALIA"),
]

In [13]:
# Build a Spark DataFrame from the local `employees` list, naming and typing the five
# columns through a DDL-style schema string.
employeesDF = spark. \
    createDataFrame(employees,
                    schema = """employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, 
                    nationality STRING"""
                                       )

In [14]:
employeesDF

employee_id,first_name,last_name,salary,nationality
1,Scott,Tiger,1000.0,united states
2,Henry,Ford,1250.0,India
3,Nick,Junior,750.0,united KINGDOM
4,Bill,Gomes,1500.0,AUSTRALIA


In [5]:
employeesDF.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- nationality: string (nullable = true)



In [6]:
employeesDF.show()

+-----------+----------+---------+------+--------------+
|employee_id|first_name|last_name|salary|   nationality|
+-----------+----------+---------+------+--------------+
|          1|     Scott|    Tiger|1000.0| united states|
|          2|     Henry|     Ford|1250.0|         India|
|          3|      Nick|   Junior| 750.0|united KINGDOM|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|
+-----------+----------+---------+------+--------------+



In [16]:
from pyspark.sql.functions import lit, concat

# Add full_name as "first_name, last_name", then drop the two source columns and display.
(
    employeesDF
    .withColumn("full_name", concat("first_name", lit(", "), "last_name"))
    .drop('first_name', 'last_name')
    .show()
)

+-----------+------+--------------+------------+
|employee_id|salary|   nationality|   full_name|
+-----------+------+--------------+------------+
|          1|1000.0| united states|Scott, Tiger|
|          2|1250.0|         India| Henry, Ford|
|          3| 750.0|united KINGDOM|Nick, Junior|
|          4|1500.0|     AUSTRALIA| Bill, Gomes|
+-----------+------+--------------+------------+



In [17]:
# Same projection done with select(); the concat expression is left without an alias,
# so the displayed column header is the generated expression text.
employeesDF.select(
    "employee_id",
    concat("first_name", lit(", "), "last_name"),
    "salary",
    "nationality",
).show()

+-----------+---------------------------------+------+--------------+
|employee_id|concat(first_name, , , last_name)|salary|   nationality|
+-----------+---------------------------------+------+--------------+
|          1|                     Scott, Tiger|1000.0| united states|
|          2|                      Henry, Ford|1250.0|         India|
|          3|                     Nick, Junior| 750.0|united KINGDOM|
|          4|                      Bill, Gomes|1500.0|     AUSTRALIA|
+-----------+---------------------------------+------+--------------+



In [18]:
# select() again, this time naming the concatenated column with alias("full_name").
# Note the separator in this cell is "," with no following space.
employeesDF.select(
    "employee_id",
    concat("first_name", lit(","), "last_name").alias("full_name"),
    "salary",
    "nationality",
).show()

+-----------+-----------+------+--------------+
|employee_id|  full_name|salary|   nationality|
+-----------+-----------+------+--------------+
|          1|Scott,Tiger|1000.0| united states|
|          2| Henry,Ford|1250.0|         India|
|          3|Nick,Junior| 750.0|united KINGDOM|
|          4| Bill,Gomes|1500.0|     AUSTRALIA|
+-----------+-----------+------+--------------+



In [25]:
# Equivalent projection written as SQL expression strings via selectExpr().
employeesDF.selectExpr(
    "employee_id",
    "concat(first_name, ', ', last_name) AS full_name",
    "salary",
    "nationality",
).show()

+-----------+------------+------+--------------+
|employee_id|   full_name|salary|   nationality|
+-----------+------------+------+--------------+
|          1|Scott, Tiger|1000.0| united states|
|          2| Henry, Ford|1250.0|         India|
|          3|Nick, Junior| 750.0|united KINGDOM|
|          4| Bill, Gomes|1500.0|     AUSTRALIA|
+-----------+------------+------+--------------+



### 165 Overview of Spark Write APIs

In [1]:
import getpass

from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

username = getpass.getuser()

# Create (or reuse) a Hive-enabled Spark session on YARN for this section.
# The warehouse path must be an f-string: without the f prefix the literal text
# "{username}" would be used as a directory name instead of the current user's name.
spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config('spark.sql.warehouse.dir', f'/user/{username}/warehouse'). \
        enableHiveSupport(). \
        appName(f'{username} | Python - Data Processing - Overview'). \
        master('yarn'). \
        getOrCreate()

In [None]:
%%sh

# Start from a clean slate: remove any previous output of this section.
# -f keeps the exit status at zero when the directory does not exist yet
# (e.g. on the very first run), so the cell does not fail.
hdfs dfs -rm -R -f -skipTrash /user/${USER}/retail_db

In [3]:
# Load the orders CSV, supplying column names and types through a DDL schema string.
orders = (
    spark.read
    .csv(
        '/public/retail_db/orders',
        schema='''
           order_id INT, order_date STRING,
           order_customer_id INT,
           order_status STRING
           ''',
    )
)

In [4]:
# Number of rows in the orders DataFrame.
orders.count()

68883

In [5]:
# Check that the column names and types match the schema passed to the reader.
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [6]:
# Preview the first 5 rows.
orders.show(5)

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|
+--------+--------------------+-----------------+---------------+
only showing top 5 rows



In [7]:
# Write orders as uncompressed Parquet under the user's HDFS home,
# replacing whatever is already at the target path.
orders.write.parquet(
    f'/user/{username}/retail_db/orders',
    mode='overwrite',
    compression='none',
)

In [9]:
%%sh

# List the files produced by the Parquet write of orders.
hdfs dfs -ls /user/${USER}/retail_db/orders

Found 2 items
-rw-r--r--   3 itv011204 supergroup          0 2024-02-06 01:57 /user/itv011204/retail_db/orders/_SUCCESS
-rw-r--r--   3 itv011204 supergroup     495238 2024-02-06 01:57 /user/itv011204/retail_db/orders/part-00000-a9a3470e-08a5-42d3-9438-5038540d7952-c000.parquet


In [15]:
# Write the projected employees data (with the derived full_name column) as
# uncompressed Parquet, replacing any existing output.
# NOTE(review): employeesDF is not created by any cell visible in this section, and the
# session cell above starts again at In [1] — confirm it is defined before this cell
# when running on a fresh kernel.
(
    employeesDF
    .selectExpr(
        "employee_id",
        "concat(first_name, ', ', last_name) AS full_name",
        "salary",
        "nationality",
    )
    .write
    .parquet(
        f'/user/{username}/retail_db/employees',
        mode='overwrite',
        compression='none',
    )
)

In [16]:
%%sh

# List the files produced by the Parquet write of employees.
hdfs dfs -ls /user/${USER}/retail_db/employees

Found 3 items
-rw-r--r--   3 itv011204 supergroup          0 2024-02-06 02:00 /user/itv011204/retail_db/employees/_SUCCESS
-rw-r--r--   3 itv011204 supergroup       1157 2024-02-06 02:00 /user/itv011204/retail_db/employees/part-00000-f707a78c-b602-40a8-bdac-9e3695181dc5-c000.parquet
-rw-r--r--   3 itv011204 supergroup       1172 2024-02-06 02:00 /user/itv011204/retail_db/employees/part-00001-f707a78c-b602-40a8-bdac-9e3695181dc5-c000.parquet


In [17]:
# Write orders as Parquet again, this time setting the save mode and compression
# through the writer's mode()/option() methods rather than keyword arguments.
(
    orders.write
    .mode('overwrite')
    .option('compression', 'none')
    .parquet(f'/user/{username}/retail_db/orders')
)

In [18]:
%%sh

# List the orders directory after the mode()/option() style write.
hdfs dfs -ls /user/${USER}/retail_db/orders

Found 2 items
-rw-r--r--   3 itv011204 supergroup          0 2024-02-06 02:02 /user/itv011204/retail_db/orders/_SUCCESS
-rw-r--r--   3 itv011204 supergroup     495238 2024-02-06 02:02 /user/itv011204/retail_db/orders/part-00000-a681d504-fb58-47b4-8c41-17e84ba3f44a-c000.parquet


In [20]:
# Generic writer form: the file format is named with format() and the
# target path is handed to save().
(
    orders.write
    .mode('overwrite')
    .option('compression', 'none')
    .format('parquet')
    .save(f'/user/{username}/retail_db/orders')
)

In [21]:
%%sh

# List the orders directory after the format()/save() style write.
hdfs dfs -ls /user/${USER}/retail_db/orders

Found 2 items
-rw-r--r--   3 itv011204 supergroup          0 2024-02-06 02:06 /user/itv011204/retail_db/orders/_SUCCESS
-rw-r--r--   3 itv011204 supergroup     495238 2024-02-06 02:06 /user/itv011204/retail_db/orders/part-00000-b6a9b40b-b9a5-47c2-a5e0-0cf13e1aee81-c000.parquet


In [22]:
# Load order_items from JSON; no schema is supplied here.
order_items = spark.read.json('/public/retail_db_json/order_items')

In [23]:
# Number of rows in the order_items DataFrame.
order_items.count()

172198

In [24]:
# Preview the first 5 rows.
order_items.show(5)

+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_product_price|order_item_quantity|order_item_subtotal|
+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|            1|                  1|                  957|                  299.98|                  1|             299.98|
|            2|                  2|                 1073|                  199.99|                  1|             199.99|
|            3|                  2|                  502|                    50.0|                  5|              250.0|
|            4|                  2|                  403|                  129.99|                  1|             129.99|
|            5|                  4|                  897|                   24.99|                  2|              49.98|
+-------------+-

In [25]:
# Inspect the column names and types of the DataFrame read from JSON.
order_items.printSchema()

root
 |-- order_item_id: long (nullable = true)
 |-- order_item_order_id: long (nullable = true)
 |-- order_item_product_id: long (nullable = true)
 |-- order_item_product_price: double (nullable = true)
 |-- order_item_quantity: long (nullable = true)
 |-- order_item_subtotal: double (nullable = true)



In [26]:
# Write order_items as a single gzip-compressed, pipe-delimited CSV file.
# coalesce(1) reduces the data to one partition before writing; the save mode is 'ignore'.
(
    order_items
    .coalesce(1)
    .write
    .mode('ignore')
    .option('compression', 'gzip')
    .option('sep', '|')
    .format('csv')
    .save(f'/user/{username}/retail_db/order_items')
)

In [27]:
%%sh

# List the files produced by the CSV write of order_items.
hdfs dfs -ls /user/${USER}/retail_db/order_items

Found 2 items
-rw-r--r--   3 itv011204 supergroup          0 2024-02-06 02:12 /user/itv011204/retail_db/order_items/_SUCCESS
-rw-r--r--   3 itv011204 supergroup    1032820 2024-02-06 02:12 /user/itv011204/retail_db/order_items/part-00000-d3794630-189b-43e2-956e-03bf04eb1a36-c000.csv.gz


In [32]:
# Keyword-argument form of the CSV write; 'overwrite' replaces the existing output.
order_items.coalesce(1).write.csv(
    f'/user/{username}/retail_db/order_items',
    sep='|',
    mode='overwrite',
    compression='gzip',
)

In [33]:
%%sh

# List the directory after the 'overwrite' write.
hdfs dfs -ls /user/${USER}/retail_db/order_items

Found 2 items
-rw-r--r--   3 itv011204 supergroup          0 2024-02-06 02:26 /user/itv011204/retail_db/order_items/_SUCCESS
-rw-r--r--   3 itv011204 supergroup    1032820 2024-02-06 02:26 /user/itv011204/retail_db/order_items/part-00000-a1395079-89c3-408e-b91f-583c788062f5-c000.csv.gz


In [34]:
# Repeat the CSV write with mode='ignore' against a target that already exists.
order_items.coalesce(1).write.csv(
    f'/user/{username}/retail_db/order_items',
    sep='|',
    mode='ignore',
    compression='gzip',
)

In [35]:
%%sh

# List the directory after the 'ignore' write; compare file names and timestamps
# with the previous listing to see whether anything was rewritten.
hdfs dfs -ls /user/${USER}/retail_db/order_items

Found 2 items
-rw-r--r--   3 itv011204 supergroup          0 2024-02-06 02:26 /user/itv011204/retail_db/order_items/_SUCCESS
-rw-r--r--   3 itv011204 supergroup    1032820 2024-02-06 02:26 /user/itv011204/retail_db/order_items/part-00000-a1395079-89c3-408e-b91f-583c788062f5-c000.csv.gz


In [36]:
# Repeat the CSV write with mode='append' against the existing target.
order_items.coalesce(1).write.csv(
    f'/user/{username}/retail_db/order_items',
    sep='|',
    mode='append',
    compression='gzip',
)

In [37]:
%%sh

# List the directory after the 'append' write; compare the number of part files
# with the previous listing.
hdfs dfs -ls /user/${USER}/retail_db/order_items

Found 3 items
-rw-r--r--   3 itv011204 supergroup          0 2024-02-06 02:27 /user/itv011204/retail_db/order_items/_SUCCESS
-rw-r--r--   3 itv011204 supergroup    1032820 2024-02-06 02:27 /user/itv011204/retail_db/order_items/part-00000-0cb24dd5-c392-4bc7-bc18-075bfacc5f31-c000.csv.gz
-rw-r--r--   3 itv011204 supergroup    1032820 2024-02-06 02:26 /user/itv011204/retail_db/order_items/part-00000-a1395079-89c3-408e-b91f-583c788062f5-c000.csv.gz


In [38]:
# mode='error' refuses to write when the target path already exists.
# That failure is the point of this cell, so it is caught and printed rather than
# left to abort a Restart & Run All of the notebook.
try:
    order_items. \
        coalesce(1). \
        write. \
        csv(f'/user/{username}/retail_db/order_items',
            sep='|',
            mode='error',
            compression='gzip'
           )
except AnalysisException as exc:
    print(f'AnalysisException: {exc}')

AnalysisException: path hdfs://m01.itversity.com:9000/user/itv011204/retail_db/order_items already exists.

In [39]:
# Finish with an 'overwrite' write to replace everything at the target path.
order_items.coalesce(1).write.csv(
    f'/user/{username}/retail_db/order_items',
    sep='|',
    mode='overwrite',
    compression='gzip',
)

In [40]:
%%sh

# List the directory after the final 'overwrite' write.
hdfs dfs -ls /user/${USER}/retail_db/order_items

Found 2 items
-rw-r--r--   3 itv011204 supergroup          0 2024-02-06 02:28 /user/itv011204/retail_db/order_items/_SUCCESS
-rw-r--r--   3 itv011204 supergroup    1032820 2024-02-06 02:28 /user/itv011204/retail_db/order_items/part-00000-1d4e3b93-b41b-4ccb-9d88-be4c32edbd05-c000.csv.gz
