## Section 15 Pyspark Processing Column Data

In [1]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config('spark.sql.warehouse.dir', '/user/{username}/warehouse'). \
        enableHiveSupport(). \
        appName(f'{username} | Python - Data Processing - Overview'). \
        master('yarn'). \
        getOrCreate()

In [2]:
from pyspark.sql import functions

In [None]:
help(functions)

In [5]:
orders= spark.read.csv(
    '/public/retail_db/orders',
    schema='order_id INT, order_date STRING, order_customer_id INT, order_status STRING'
)

In [3]:
from pyspark.sql.functions import date_format

In [7]:
orders.show(2)

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|
+--------+--------------------+-----------------+---------------+
only showing top 2 rows



In [8]:
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [9]:
orders.select('*', date_format('order_date','yyyyMM').alias('order_month')).show(2)

+--------+--------------------+-----------------+---------------+-----------+
|order_id|          order_date|order_customer_id|   order_status|order_month|
+--------+--------------------+-----------------+---------------+-----------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|     201307|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|     201307|
+--------+--------------------+-----------------+---------------+-----------+
only showing top 2 rows



In [12]:
orders.withColumn('order_month', date_format('order_date','yyyyMM')).show(2)

+--------+--------------------+-----------------+---------------+-----------+
|order_id|          order_date|order_customer_id|   order_status|order_month|
+--------+--------------------+-----------------+---------------+-----------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|     201307|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|     201307|
+--------+--------------------+-----------------+---------------+-----------+
only showing top 2 rows



In [20]:
# Filter

orders.filter(date_format('order_date','yyyyMM')==201401).show(2)

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|   25876|2014-01-01 00:00:...|             3414|PENDING_PAYMENT|
|   25877|2014-01-01 00:00:...|             5549|PENDING_PAYMENT|
+--------+--------------------+-----------------+---------------+
only showing top 2 rows



In [23]:
# groupBy

orders.groupBy(date_format('order_date','yyyyMM').alias('order_month')). \
    count(). \
    show(2)

+-----------+-----+
|order_month|count|
+-----------+-----+
|     201401| 5908|
|     201405| 5467|
+-----------+-----+
only showing top 2 rows



### 167 Create Dummy Data Frame to explore Functions

In [2]:
l = [('X',)]

In [3]:
df = spark.createDataFrame(l, "dummy STRING")

In [6]:
df.printSchema()

root
 |-- dummy: string (nullable = true)



In [7]:
df.show()

+-----+
|dummy|
+-----+
|    X|
+-----+



In [9]:
from pyspark.sql.functions import current_date

df.select(current_date()).show()

+--------------+
|current_date()|
+--------------+
|    2024-02-06|
+--------------+



In [10]:
df.select(current_date().alias("current_date")).show()

+------------+
|current_date|
+------------+
|  2024-02-06|
+------------+



In [4]:
employees = [
    (1, "Scott", "Tiger", 1000.0, 
      "united states", "+1 123 456 7890", "123 45 6789"
    ),
     (2, "Henry", "Ford", 1250.0, 
      "India", "+91 234 567 8901", "456 78 9123"
     ),
     (3, "Nick", "Junior", 750.0, 
      "united KINGDOM", "+44 111 111 1111", "222 33 4444"
     ),
     (4, "Bill", "Gomes", 1500.0, 
      "AUSTRALIA", "+61 987 654 3210", "789 12 6118"
     )
]

In [5]:
len(employees)

4

In [6]:
employeesDF = spark.createDataFrame(employees,
            schema="""employee_id INT, first_name STRING, last_name STRING,
            salary FLOAT, nationality STRING,
            phone STRING, ssn STRING """)


In [7]:
employeesDF.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- nationality: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- ssn: string (nullable = true)



In [8]:
employeesDF.show(truncate=False)

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|nationality   |phone           |ssn        |
+-----------+----------+---------+------+--------------+----------------+-----------+
|1          |Scott     |Tiger    |1000.0|united states |+1 123 456 7890 |123 45 6789|
|2          |Henry     |Ford     |1250.0|India         |+91 234 567 8901|456 78 9123|
|3          |Nick      |Junior   |750.0 |united KINGDOM|+44 111 111 1111|222 33 4444|
|4          |Bill      |Gomes    |1500.0|AUSTRALIA     |+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+--------------+----------------+-----------+



### Categories of Predefined functions used in Spark DataFrame columns

There are approximately 300 functions under pyspark.sql.functions. At a higher level they can be grouped into a few categories.

* String Manipulation Functions

    * Case Conversion - lower, upper
    * Getting Length - length
    * Extracting substrings - substring, split
    * Trimming - trim, ltrim, rtrim
    * Padding - lpad, rpad
    * Concatenating string - concat, concat_ws

* Date Manipulation Functions

    * Getting current date and time - current_date, current_timestamp
    * Date Arithmetic - date_add, date_sub, datediff, months_between, add_months, next_day
    * Beginning and Ending Date or Time - last_day, trunc, date_trunc
    * Formatting Date - date_format
    * Extracting Information - dayofyear, dayofmonth, dayofweek, year, month

* Aggregate Functions

    * count, countDistinct
    * sum, avg
    * min, max

* Other Functions - We will explore depending on the use cases.

    * CASE and WHEN
    * CAST for type casting
    * Functions to manage special types such as ARRAY, MAP, STRUCT type columns

* Many others


In [17]:
### 169 Col and lit

In [2]:
employees = [
    (1, "Scott", "Tiger", 1000.0, 
      "united states", "+1 123 456 7890", "123 45 6789"
    ),
     (2, "Henry", "Ford", 1250.0, 
      "India", "+91 234 567 8901", "456 78 9123"
     ),
     (3, "Nick", "Junior", 750.0, 
      "united KINGDOM", "+44 111 111 1111", "222 33 4444"
     ),
     (4, "Bill", "Gomes", 1500.0, 
      "AUSTRALIA", "+61 987 654 3210", "789 12 6118"
     )
]

In [3]:
employeesDF = spark.createDataFrame(employees,
            schema="""employee_id INT, first_name STRING, last_name STRING,
            salary FLOAT, nationality STRING,
            phone STRING, ssn STRING """)


In [4]:
employeesDF. \
    select("first_name", "last_name").show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     Scott|    Tiger|
|     Henry|     Ford|
|      Nick|   Junior|
|      Bill|    Gomes|
+----------+---------+



In [5]:
employeesDF. \
    groupBy("nationality").count().show()

+--------------+-----+
|   nationality|count|
+--------------+-----+
|         India|    1|
|united KINGDOM|    1|
| united states|    1|
|     AUSTRALIA|    1|
+--------------+-----+



In [6]:
employeesDF. \
    orderBy("employee_id").show()

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|           phone|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [13]:
from pyspark.sql.functions import col

In [8]:
employeesDF. \
    select(col("first_name"), col("last_name")). \
    show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     Scott|    Tiger|
|     Henry|     Ford|
|      Nick|   Junior|
|      Bill|    Gomes|
+----------+---------+



In [10]:
from pyspark.sql.functions import upper

In [10]:
employeesDF. \
    select(upper("first_name"), upper("last_name") ). \
    show()

+-----------------+----------------+
|upper(first_name)|upper(last_name)|
+-----------------+----------------+
|            SCOTT|           TIGER|
|            HENRY|            FORD|
|             NICK|          JUNIOR|
|             BILL|           GOMES|
+-----------------+----------------+



In [11]:
employeesDF. \
    select(upper(col("first_name")), upper(col("last_name"))). \
    show()

+-----------------+----------------+
|upper(first_name)|upper(last_name)|
+-----------------+----------------+
|            SCOTT|           TIGER|
|            HENRY|            FORD|
|             NICK|          JUNIOR|
|             BILL|           GOMES|
+-----------------+----------------+



In [13]:
employeesDF. \
    groupBy(upper(col("nationality"))). \
    count(). \
    show()

+------------------+-----+
|upper(nationality)|count|
+------------------+-----+
|    UNITED KINGDOM|    1|
|             INDIA|    1|
|         AUSTRALIA|    1|
|     UNITED STATES|    1|
+------------------+-----+



In [11]:
from pyspark.sql.functions import desc

employeesDF. \
    orderBy("employee_id".desc()). \
    show()

AttributeError: 'str' object has no attribute 'desc'

In [14]:
employeesDF. \
    orderBy(col("employee_id").desc()). \
    show()

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|           phone|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [15]:
employeesDF. \
    orderBy(col("first_name").desc()). \
    show()

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|           phone|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [16]:
employeesDF. \
    orderBy(employeesDF['first_name'].alias('first_name')). \
    show()

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|           phone|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [20]:
employeesDF. \
    orderBy(employeesDF['first_name'].alias('first_name').desc()). \
    show()

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|           phone|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [18]:
employeesDF. \
    orderBy(upper(employeesDF['first_name']).alias('first_name').desc()). \
    show()

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|           phone|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [19]:
employeesDF. \
    select("*"). \
    orderBy(upper(employeesDF['first_name']).alias('first_name').desc()). \
    show()

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|           phone|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [21]:
employeesDF. \
    select("employee_id","first_name", "last_name"). \
    orderBy(upper(employeesDF['first_name']).alias('first_name').desc()). \
    show()

+-----------+----------+---------+
|employee_id|first_name|last_name|
+-----------+----------+---------+
|          1|     Scott|    Tiger|
|          3|      Nick|   Junior|
|          2|     Henry|     Ford|
|          4|      Bill|    Gomes|
+-----------+----------+---------+



In [22]:
from pyspark.sql.functions import concat

In [23]:
employeesDF. \
    select(concat(col("first_name"), ", ", col("last_name"))). \
    show()

AnalysisException: cannot resolve '`, `' given input columns: [employee_id, first_name, last_name, nationality, phone, salary, ssn];
'Project [concat(first_name#1, ', , last_name#2) AS concat(first_name, , , last_name)#282]
+- LogicalRDD [employee_id#0, first_name#1, last_name#2, salary#3, nationality#4, phone#5, ssn#6], false


In [26]:
employeesDF. \
    select(concat(employeesDF["first_name"], ", ", employeesDF["last_name"])). \
    show()

AnalysisException: cannot resolve '`, `' given input columns: [employee_id, first_name, last_name, nationality, phone, salary, ssn];
'Project [concat(first_name#1, ', , last_name#2) AS concat(first_name, , , last_name)#290]
+- LogicalRDD [employee_id#0, first_name#1, last_name#2, salary#3, nationality#4, phone#5, ssn#6], false


In [24]:
from pyspark.sql.functions import lit

In [25]:
employeesDF. \
    select(concat(col("first_name"), lit(", "), col("last_name"))). \
    show()

+---------------------------------+
|concat(first_name, , , last_name)|
+---------------------------------+
|                     Scott, Tiger|
|                      Henry, Ford|
|                     Nick, Junior|
|                      Bill, Gomes|
+---------------------------------+



In [28]:
from pyspark.sql.functions import concat, col, lit

employeesDF. \
    select(concat(col("first_name"), lit(", "), col("last_name")).alias("full_name")). \
    show()

+------------+
|   full_name|
+------------+
|Scott, Tiger|
| Henry, Ford|
|Nick, Junior|
| Bill, Gomes|
+------------+



### 170 Common string Manipulation Functions for DataFrame columns

In [1]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config('spark.sql.warehouse.dir', '/user/{username}/warehouse'). \
        enableHiveSupport(). \
        appName(f'{username} | Python - Data Processing - Overview'). \
        master('yarn'). \
        getOrCreate()

In [2]:
employees = [
    (1, "Scott", "Tiger", 1000.0, 
      "united states", "+1 123 456 7890", "123 45 6789"
    ),
     (2, "Henry", "Ford", 1250.0, 
      "India", "+91 234 567 8901", "456 78 9123"
     ),
     (3, "Nick", "Junior", 750.0, 
      "united KINGDOM", "+44 111 111 1111", "222 33 4444"
     ),
     (4, "Bill", "Gomes", 1500.0, 
      "AUSTRALIA", "+61 987 654 3210", "789 12 6118"
     )
]

In [3]:
employeesDF = spark.createDataFrame(employees,
            schema="""employee_id INT, first_name STRING, last_name STRING,
            salary FLOAT, nationality STRING,
            phone STRING, ssn STRING """)

In [4]:
from pyspark.sql.functions import concat

employeesDF. \
    withColumn("full_name", concat("first_name", "last_name")). \
    show()

+-----------+----------+---------+------+--------------+----------------+-----------+----------+
|employee_id|first_name|last_name|salary|   nationality|           phone|        ssn| full_name|
+-----------+----------+---------+------+--------------+----------------+-----------+----------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|ScottTiger|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123| HenryFord|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|NickJunior|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118| BillGomes|
+-----------+----------+---------+------+--------------+----------------+-----------+----------+



In [5]:
from pyspark.sql.functions import concat, lit

employeesDF. \
    withColumn("full_name", concat("first_name",lit(", "), "last_name")). \
    show()

+-----------+----------+---------+------+--------------+----------------+-----------+------------+
|employee_id|first_name|last_name|salary|   nationality|           phone|        ssn|   full_name|
+-----------+----------+---------+------+--------------+----------------+-----------+------------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|Scott, Tiger|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123| Henry, Ford|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|Nick, Junior|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118| Bill, Gomes|
+-----------+----------+---------+------+--------------+----------------+-----------+------------+



In [6]:
from pyspark.sql.functions import col, lower, upper, initcap, length

In [7]:
employeesDF. \
    select("employee_id", "nationality"). \
    withColumn("nationality_upper",upper(col("nationality"))). \
    withColumn("nationality_lower",lower(col("nationality"))). \
    withColumn("nationality_initcap",initcap(col("nationality"))). \
    withColumn("nationality_length",length(col("nationality"))). \
    show()

+-----------+--------------+-----------------+-----------------+-------------------+------------------+
|employee_id|   nationality|nationality_upper|nationality_lower|nationality_initcap|nationality_length|
+-----------+--------------+-----------------+-----------------+-------------------+------------------+
|          1| united states|    UNITED STATES|    united states|      United States|                13|
|          2|         India|            INDIA|            india|              India|                 5|
|          3|united KINGDOM|   UNITED KINGDOM|   united kingdom|     United Kingdom|                14|
|          4|     AUSTRALIA|        AUSTRALIA|        australia|          Australia|                 9|
+-----------+--------------+-----------------+-----------------+-------------------+------------------+



### 171 Extracting String using substring from Spark DataFrame columns

In [8]:
s = "Hello World"

In [9]:
s[:5]

'Hello'

In [10]:
s[1:4]

'ell'

In [11]:
l = [('X',)]

In [12]:
df = spark.createDataFrame(l, "dummy STRING")

In [13]:
from pyspark.sql.functions import substring, lit

In [14]:
df.select(substring(lit("Hello World"), 7, 5)).show()

+----------------------------+
|substring(Hello World, 7, 5)|
+----------------------------+
|                       World|
+----------------------------+



In [15]:
df.select(substring(lit("Hello World"), -5, 5)).show()

+-----------------------------+
|substring(Hello World, -5, 5)|
+-----------------------------+
|                        World|
+-----------------------------+



In [16]:
employeesDF 

employee_id,first_name,last_name,salary,nationality,phone,ssn
1,Scott,Tiger,1000.0,united states,+1 123 456 7890,123 45 6789
2,Henry,Ford,1250.0,India,+91 234 567 8901,456 78 9123
3,Nick,Junior,750.0,united KINGDOM,+44 111 111 1111,222 33 4444
4,Bill,Gomes,1500.0,AUSTRALIA,+61 987 654 3210,789 12 6118


In [17]:
from pyspark.sql.functions import substring, lit, cast

ImportError: cannot import name 'cast'

In [20]:
employeesDF. \
    select("employee_id","phone","ssn"). \
    withColumn("phone_last4", substring(col("phone"),-4,4).cast("int")). \
    withColumn("ssn_last4", substring(col("ssn"),8,4).cast("int")). \
    show()


+-----------+----------------+-----------+-----------+---------+
|employee_id|           phone|        ssn|phone_last4|ssn_last4|
+-----------+----------------+-----------+-----------+---------+
|          1| +1 123 456 7890|123 45 6789|       7890|     6789|
|          2|+91 234 567 8901|456 78 9123|       8901|     9123|
|          3|+44 111 111 1111|222 33 4444|       1111|     4444|
|          4|+61 987 654 3210|789 12 6118|       3210|     6118|
+-----------+----------------+-----------+-----------+---------+



In [18]:
employeesDF. \
    select("employee_id","phone","ssn"). \
    withColumn("phone_last4", substring(col("phone"),-4,4).cast("int")). \
    withColumn("ssn_last4", substring(col("ssn"),8,4).cast("int")). \
    printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- phone: string (nullable = true)
 |-- ssn: string (nullable = true)
 |-- phone_last4: integer (nullable = true)
 |-- ssn_last4: integer (nullable = true)



### 172 Extracting String using split from Spark DataFrame columns

In [1]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config('spark.sql.warehouse.dir', '/user/{username}/warehouse'). \
        config('spark.shuffle.io.connectionTimeout',6000). \
        enableHiveSupport(). \
        appName(f'{username} | Python - Data Processing - Overview'). \
        master('yarn'). \
        getOrCreate()

In [2]:
l = [('X',)]

In [3]:
df = spark.createDataFrame(l,"dummy STRING")

In [4]:
df

dummy
X


In [5]:
from pyspark.sql.functions import split, explode, lit

In [6]:
df.select(split(lit("Hello World, how are you"), " ")).show(truncate=False)

+--------------------------------------+
|split(Hello World, how are you,  , -1)|
+--------------------------------------+
|[Hello, World,, how, are, you]        |
+--------------------------------------+



In [7]:
df.select(split(lit("Hello World, how are you"), " ")[2]). \
    show(truncate=False)

+-----------------------------------------+
|split(Hello World, how are you,  , -1)[2]|
+-----------------------------------------+
|how                                      |
+-----------------------------------------+



In [8]:
df.select(explode(split(lit("Hello World, how are you"), " "))). \
    show(truncate=False)

+------+
|col   |
+------+
|Hello |
|World,|
|how   |
|are   |
|you   |
+------+



In [9]:
df.select(explode(split(lit("Hello World, how are you"), " ")).alias("word")). \
    show(truncate=False)

+------+
|word  |
+------+
|Hello |
|World,|
|how   |
|are   |
|you   |
+------+



In [10]:
employees = [(1, "Scott", "Tiger", 1000.0, 
                      "united states", "+1 123 456 7890,+1 234 567 8901", "123 45 6789"
                     ),
                     (2, "Henry", "Ford", 1250.0, 
                      "India", "+91 234 567 8901", "456 78 9123"
                     ),
                     (3, "Nick", "Junior", 750.0, 
                      "united KINGDOM", "+44 111 111 1111,+44 222 222 2222", "222 33 4444"
                     ),
                     (4, "Bill", "Gomes", 1500.0, 
                      "AUSTRALIA", "+61 987 654 3210,+61 876 543 2109", "789 12 6118"
                     )
                ]

In [11]:
employeesDF = spark.createDataFrame(employees,
                schema="""employee_id INT, first_name STRING, last_name STRING, 
                salary FLOAT, nationality STRING, phone_numbers STRING, ssn STRING
                """)

In [12]:
employeesDF.show(truncate=False)

+-----------+----------+---------+------+--------------+---------------------------------+-----------+
|employee_id|first_name|last_name|salary|nationality   |phone_numbers                    |ssn        |
+-----------+----------+---------+------+--------------+---------------------------------+-----------+
|1          |Scott     |Tiger    |1000.0|united states |+1 123 456 7890,+1 234 567 8901  |123 45 6789|
|2          |Henry     |Ford     |1250.0|India         |+91 234 567 8901                 |456 78 9123|
|3          |Nick      |Junior   |750.0 |united KINGDOM|+44 111 111 1111,+44 222 222 2222|222 33 4444|
|4          |Bill      |Gomes    |1500.0|AUSTRALIA     |+61 987 654 3210,+61 876 543 2109|789 12 6118|
+-----------+----------+---------+------+--------------+---------------------------------+-----------+



In [13]:
employeesDF.select('first_name', 'last_name','phone_numbers','ssn').show(truncate=False)

+----------+---------+---------------------------------+-----------+
|first_name|last_name|phone_numbers                    |ssn        |
+----------+---------+---------------------------------+-----------+
|Scott     |Tiger    |+1 123 456 7890,+1 234 567 8901  |123 45 6789|
|Henry     |Ford     |+91 234 567 8901                 |456 78 9123|
|Nick      |Junior   |+44 111 111 1111,+44 222 222 2222|222 33 4444|
|Bill      |Gomes    |+61 987 654 3210,+61 876 543 2109|789 12 6118|
+----------+---------+---------------------------------+-----------+



In [14]:
employeesDF.select('first_name', 'last_name','phone_numbers','ssn'). \
    withColumn('phone_number',split('phone_numbers',",")). \
    show(truncate=False)

+----------+---------+---------------------------------+-----------+------------------------------------+
|first_name|last_name|phone_numbers                    |ssn        |phone_number                        |
+----------+---------+---------------------------------+-----------+------------------------------------+
|Scott     |Tiger    |+1 123 456 7890,+1 234 567 8901  |123 45 6789|[+1 123 456 7890, +1 234 567 8901]  |
|Henry     |Ford     |+91 234 567 8901                 |456 78 9123|[+91 234 567 8901]                  |
|Nick      |Junior   |+44 111 111 1111,+44 222 222 2222|222 33 4444|[+44 111 111 1111, +44 222 222 2222]|
|Bill      |Gomes    |+61 987 654 3210,+61 876 543 2109|789 12 6118|[+61 987 654 3210, +61 876 543 2109]|
+----------+---------+---------------------------------+-----------+------------------------------------+



In [15]:
employeesDF.select('first_name', 'last_name','phone_numbers','ssn',split('phone_numbers',",").alias('phone_number')). \
    show(truncate=False)

+----------+---------+---------------------------------+-----------+------------------------------------+
|first_name|last_name|phone_numbers                    |ssn        |phone_number                        |
+----------+---------+---------------------------------+-----------+------------------------------------+
|Scott     |Tiger    |+1 123 456 7890,+1 234 567 8901  |123 45 6789|[+1 123 456 7890, +1 234 567 8901]  |
|Henry     |Ford     |+91 234 567 8901                 |456 78 9123|[+91 234 567 8901]                  |
|Nick      |Junior   |+44 111 111 1111,+44 222 222 2222|222 33 4444|[+44 111 111 1111, +44 222 222 2222]|
|Bill      |Gomes    |+61 987 654 3210,+61 876 543 2109|789 12 6118|[+61 987 654 3210, +61 876 543 2109]|
+----------+---------+---------------------------------+-----------+------------------------------------+



In [16]:
employeesDF.select('first_name', 'last_name','phone_numbers','ssn'). \
    withColumn('phone_number',explode(split('phone_numbers',","))). \
    show(truncate=False)

+----------+---------+---------------------------------+-----------+----------------+
|first_name|last_name|phone_numbers                    |ssn        |phone_number    |
+----------+---------+---------------------------------+-----------+----------------+
|Scott     |Tiger    |+1 123 456 7890,+1 234 567 8901  |123 45 6789|+1 123 456 7890 |
|Scott     |Tiger    |+1 123 456 7890,+1 234 567 8901  |123 45 6789|+1 234 567 8901 |
|Henry     |Ford     |+91 234 567 8901                 |456 78 9123|+91 234 567 8901|
|Nick      |Junior   |+44 111 111 1111,+44 222 222 2222|222 33 4444|+44 111 111 1111|
|Nick      |Junior   |+44 111 111 1111,+44 222 222 2222|222 33 4444|+44 222 222 2222|
|Bill      |Gomes    |+61 987 654 3210,+61 876 543 2109|789 12 6118|+61 987 654 3210|
|Bill      |Gomes    |+61 987 654 3210,+61 876 543 2109|789 12 6118|+61 876 543 2109|
+----------+---------+---------------------------------+-----------+----------------+



In [17]:
employeesDF.select('first_name', 'last_name','phone_numbers','ssn',explode(split('phone_numbers',",")).alias('phone_number')). \
    show(truncate=False)

+----------+---------+---------------------------------+-----------+----------------+
|first_name|last_name|phone_numbers                    |ssn        |phone_number    |
+----------+---------+---------------------------------+-----------+----------------+
|Scott     |Tiger    |+1 123 456 7890,+1 234 567 8901  |123 45 6789|+1 123 456 7890 |
|Scott     |Tiger    |+1 123 456 7890,+1 234 567 8901  |123 45 6789|+1 234 567 8901 |
|Henry     |Ford     |+91 234 567 8901                 |456 78 9123|+91 234 567 8901|
|Nick      |Junior   |+44 111 111 1111,+44 222 222 2222|222 33 4444|+44 111 111 1111|
|Nick      |Junior   |+44 111 111 1111,+44 222 222 2222|222 33 4444|+44 222 222 2222|
|Bill      |Gomes    |+61 987 654 3210,+61 876 543 2109|789 12 6118|+61 987 654 3210|
|Bill      |Gomes    |+61 987 654 3210,+61 876 543 2109|789 12 6118|+61 876 543 2109|
+----------+---------+---------------------------------+-----------+----------------+



In [18]:
employeesDF.select('first_name', 'last_name','phone_numbers','ssn'). \
    withColumn('phone_number',explode(split('phone_numbers',","))). \
    withColumn('area_code',split(explode(split('phone_numbers',","))," ")[1]). \
    withColumn('phone_last4',split(explode(split('phone_numbers',","))," ")[3]). \
    withColumn('ssn_last4',split('ssn'," ")[3]). \
    show(truncate=False)

AnalysisException: Generators are not supported when it's nested in expressions, but got: split(explode(split(phone_numbers, ,, -1)),  , -1)[1]

In [24]:
employeesDF.select('first_name', 'last_name','phone_numbers','ssn'). \
    withColumn('phone_number',explode(split('phone_numbers',","))). \
    select('first_name','last_name', 'phone_numbers', 'ssn','phone_number').\
    withColumn('area_code',split('phone_number'," ")[1]). \
    withColumn('phone_last4', split('phone_number', " ")[3]). \
    withColumn('ssn_last4', split('ssn'," ")[3]). \
    show(truncate=False)

+----------+---------+---------------------------------+-----------+----------------+---------+-----------+---------+
|first_name|last_name|phone_numbers                    |ssn        |phone_number    |area_code|phone_last4|ssn_last4|
+----------+---------+---------------------------------+-----------+----------------+---------+-----------+---------+
|Scott     |Tiger    |+1 123 456 7890,+1 234 567 8901  |123 45 6789|+1 123 456 7890 |123      |7890       |null     |
|Scott     |Tiger    |+1 123 456 7890,+1 234 567 8901  |123 45 6789|+1 234 567 8901 |234      |8901       |null     |
|Henry     |Ford     |+91 234 567 8901                 |456 78 9123|+91 234 567 8901|234      |8901       |null     |
|Nick      |Junior   |+44 111 111 1111,+44 222 222 2222|222 33 4444|+44 111 111 1111|111      |1111       |null     |
|Nick      |Junior   |+44 111 111 1111,+44 222 222 2222|222 33 4444|+44 222 222 2222|222      |2222       |null     |
|Bill      |Gomes    |+61 987 654 3210,+61 876 543 2109|

In [28]:
employeesDF.select('first_name', 'last_name','phone_numbers','ssn'). \
    withColumn('phone_number',explode(split('phone_numbers',","))). \
    select('first_name','last_name', 'phone_numbers', 'ssn','phone_number').\
    withColumn('area_code',split('phone_number'," ")[1].cast("int")). \
    withColumn('phone_last4', split('phone_number', " ")[3].cast("int")). \
    withColumn('ssn_last4', split('ssn'," ")[2].cast("int"))

first_name,last_name,phone_numbers,ssn,phone_number,area_code,phone_last4,ssn_last4
Scott,Tiger,"+1 123 456 7890,+...",123 45 6789,+1 123 456 7890,123,7890,6789
Scott,Tiger,"+1 123 456 7890,+...",123 45 6789,+1 234 567 8901,234,8901,6789
Henry,Ford,+91 234 567 8901,456 78 9123,+91 234 567 8901,234,8901,9123
Nick,Junior,"+44 111 111 1111,...",222 33 4444,+44 111 111 1111,111,1111,4444
Nick,Junior,"+44 111 111 1111,...",222 33 4444,+44 222 222 2222,222,2222,4444
Bill,Gomes,"+61 987 654 3210,...",789 12 6118,+61 987 654 3210,987,3210,6118
Bill,Gomes,"+61 987 654 3210,...",789 12 6118,+61 876 543 2109,876,2109,6118


In [75]:
employeesDF = employeesDF.select('first_name', 'last_name','phone_numbers','ssn'). \
                withColumn('phone_number',explode(split('phone_numbers',",")))

In [78]:
employeesDF.show(truncate=False)

+----------+---------+---------------------------------+-----------+----------------+
|first_name|last_name|phone_numbers                    |ssn        |phone_number    |
+----------+---------+---------------------------------+-----------+----------------+
|Scott     |Tiger    |+1 123 456 7890,+1 234 567 8901  |123 45 6789|+1 123 456 7890 |
|Scott     |Tiger    |+1 123 456 7890,+1 234 567 8901  |123 45 6789|+1 234 567 8901 |
|Henry     |Ford     |+91 234 567 8901                 |456 78 9123|+91 234 567 8901|
|Nick      |Junior   |+44 111 111 1111,+44 222 222 2222|222 33 4444|+44 111 111 1111|
|Nick      |Junior   |+44 111 111 1111,+44 222 222 2222|222 33 4444|+44 222 222 2222|
|Bill      |Gomes    |+61 987 654 3210,+61 876 543 2109|789 12 6118|+61 987 654 3210|
|Bill      |Gomes    |+61 987 654 3210,+61 876 543 2109|789 12 6118|+61 876 543 2109|
+----------+---------+---------------------------------+-----------+----------------+



In [82]:
employeesDF.select('first_name','last_name','phone_number','ssn'). \
    withColumn('area_code',split('phone_number'," ")[1].cast("int")). \
    withColumn('phone_last4',split('phone_number'," ")[3].cast("int")). \
    withColumn('ssn_last4',split('ssn'," ")[2].cast("int")). \
    show()

+----------+---------+----------------+-----------+---------+-----------+---------+
|first_name|last_name|    phone_number|        ssn|area_code|phone_last4|ssn_last4|
+----------+---------+----------------+-----------+---------+-----------+---------+
|     Scott|    Tiger| +1 123 456 7890|123 45 6789|      123|       7890|     6789|
|     Scott|    Tiger| +1 234 567 8901|123 45 6789|      234|       8901|     6789|
|     Henry|     Ford|+91 234 567 8901|456 78 9123|      234|       8901|     9123|
|      Nick|   Junior|+44 111 111 1111|222 33 4444|      111|       1111|     4444|
|      Nick|   Junior|+44 222 222 2222|222 33 4444|      222|       2222|     4444|
|      Bill|    Gomes|+61 987 654 3210|789 12 6118|      987|       3210|     6118|
|      Bill|    Gomes|+61 876 543 2109|789 12 6118|      876|       2109|     6118|
+----------+---------+----------------+-----------+---------+-----------+---------+



In [28]:
spark.driver.port

AttributeError: 'SparkSession' object has no attribute 'driver'

In [83]:
employeesDF

first_name,last_name,phone_numbers,ssn,phone_number
Scott,Tiger,"+1 123 456 7890,+...",123 45 6789,+1 123 456 7890
Scott,Tiger,"+1 123 456 7890,+...",123 45 6789,+1 234 567 8901
Henry,Ford,+91 234 567 8901,456 78 9123,+91 234 567 8901
Nick,Junior,"+44 111 111 1111,...",222 33 4444,+44 111 111 1111
Nick,Junior,"+44 111 111 1111,...",222 33 4444,+44 222 222 2222
Bill,Gomes,"+61 987 654 3210,...",789 12 6118,+61 987 654 3210
Bill,Gomes,"+61 987 654 3210,...",789 12 6118,+61 876 543 2109


In [85]:
employeesDF.groupBy('first_name','last_name'). \
    count(). \
    show()

+----------+---------+-----+
|first_name|last_name|count|
+----------+---------+-----+
|      Nick|   Junior|    2|
|     Henry|     Ford|    1|
|      Bill|    Gomes|    2|
|     Scott|    Tiger|    2|
+----------+---------+-----+



### 173 Padding characters around Strings in Spark DataFrame Columns

In [8]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config('spark.sql.warehouse.dir', '/user/{username}/warehouse'). \
        config('spark.shuffle.io.connectionTimeout',6000). \
        enableHiveSupport(). \
        appName(f'{username} | Python - Data Processing - Overview'). \
        master('yarn'). \
        getOrCreate()

In [9]:
l = [('X',)]

In [10]:
df = spark.createDataFrame(l,'dummy STRING')

In [11]:
df

dummy
X


In [13]:
from pyspark.sql.functions import lit

In [14]:
df.select(lit("Hello World!"))

Hello World!
Hello World!


In [15]:
df.select(lit("Hello World!").alias("dummy"))

dummy
Hello World!


In [16]:
from pyspark.sql.functions import lpad, rpad

In [17]:
df.select(lpad(lit("Hello"), 10, "-").alias("dummy")).show()

+----------+
|     dummy|
+----------+
|-----Hello|
+----------+



In [18]:
employees = [(1, "Scott", "Tiger", 1000.0, 
                      "united states", "+1 123 456 7890", "123 45 6789"
                     ),
                     (2, "Henry", "Ford", 1250.0, 
                      "India", "+91 234 567 8901", "456 78 9123"
                     ),
                     (3, "Nick", "Junior", 750.0, 
                      "united KINGDOM", "+44 111 111 1111", "222 33 4444"
                     ),
                     (4, "Bill", "Gomes", 1500.0, 
                      "AUSTRALIA", "+61 987 654 3210", "789 12 6118"
                     )
                ]

In [19]:
employeesDF = spark.createDataFrame(employees). \
    toDF("employee_id", "first_name",
        "last_name", "salary",
        "nationality", "phone_number",
        "ssn")

In [20]:
employeesDF.printSchema()

root
 |-- employee_id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- nationality: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- ssn: string (nullable = true)



In [22]:
employeesDF.describe()

summary,employee_id,first_name,last_name,salary,nationality,phone_number,ssn
count,4.0,4,4,4.0,4,4,4
mean,2.5,,,1125.0,,,
stddev,1.2909944487358056,,,322.7486121839514,,,
min,1.0,Bill,Ford,750.0,AUSTRALIA,+1 123 456 7890,123 45 6789
max,4.0,Scott,Tiger,1500.0,united states,+91 234 567 8901,789 12 6118


In [23]:
employeesDF

employee_id,first_name,last_name,salary,nationality,phone_number,ssn
1,Scott,Tiger,1000.0,united states,+1 123 456 7890,123 45 6789
2,Henry,Ford,1250.0,India,+91 234 567 8901,456 78 9123
3,Nick,Junior,750.0,united KINGDOM,+44 111 111 1111,222 33 4444
4,Bill,Gomes,1500.0,AUSTRALIA,+61 987 654 3210,789 12 6118


In [24]:
employeesDF.columns

['employee_id',
 'first_name',
 'last_name',
 'salary',
 'nationality',
 'phone_number',
 'ssn']

In [33]:
employeesDF.select(lpad("employee_id",5,"0")).show()

+-----------------------+
|lpad(employee_id, 5, 0)|
+-----------------------+
|                  00001|
|                  00002|
|                  00003|
|                  00004|
+-----------------------+



In [42]:
employeesDF.select(
    lpad("employee_id",5,"0"),
    rpad("first_name",10,"-"),
    rpad("last_name",10,"-"),
    lpad("salary",10,"0"),
    rpad("nationality",15,"-"),
    rpad("phone_number",17,"-"),
    "ssn"
    )

"lpad(employee_id, 5, 0)","rpad(first_name, 10, -)","rpad(last_name, 10, -)","lpad(salary, 10, 0)","rpad(nationality, 15, -)","rpad(phone_number, 17, -)",ssn
1,Scott-----,Tiger-----,1000.0,united states--,+1 123 456 7890--,123 45 6789
2,Henry-----,Ford------,1250.0,India----------,+91 234 567 8901-,456 78 9123
3,Nick------,Junior----,750.0,united KINGDOM-,+44 111 111 1111-,222 33 4444
4,Bill------,Gomes-----,1500.0,AUSTRALIA------,+61 987 654 3210-,789 12 6118


In [38]:
from pyspark.sql.functions import concat

In [43]:
empFixedDF = employeesDF.select(
    concat(
    lpad("employee_id",5,"0"),
    rpad("first_name",10,"-"),
    rpad("last_name",10,"-"),
    lpad("salary",10,"0"),
    rpad("nationality",15,"-"),
    rpad("phone_number",17,"-"),
    "ssn"
    ).alias("employees"))

In [46]:
empFixedDF.show(truncate=False)

+------------------------------------------------------------------------------+
|employees                                                                     |
+------------------------------------------------------------------------------+
|00001Scott-----Tiger-----00001000.0united states--+1 123 456 7890--123 45 6789|
|00002Henry-----Ford------00001250.0India----------+91 234 567 8901-456 78 9123|
|00003Nick------Junior----00000750.0united KINGDOM-+44 111 111 1111-222 33 4444|
|00004Bill------Gomes-----00001500.0AUSTRALIA------+61 987 654 3210-789 12 6118|
+------------------------------------------------------------------------------+



### 174 Trimming characters from strings

In [1]:
from pyspark.sql.functions import ltrim, rtrim, trim

In [2]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config('spark.sql.warehouse.dir', '/user/{username}/warehouse'). \
        config('spark.shuffle.io.connectionTimeout',6000). \
        enableHiveSupport(). \
        appName(f'{username} | Python - Data Processing - Overview'). \
        master('yarn'). \
        getOrCreate()

In [9]:
l = [("    Hello.    ",)]

In [10]:
df = spark.createDataFrame(l).toDF("dummy")

In [11]:
from pyspark.sql.functions import ltrim, rtrim, trim, col

In [12]:
df.withColumn("ltrim",ltrim("dummy")).show()

+--------------+----------+
|         dummy|     ltrim|
+--------------+----------+
|    Hello.    |Hello.    |
+--------------+----------+



In [13]:
df.withColumn("ltrim",ltrim(col("dummy"))).show()

+--------------+----------+
|         dummy|     ltrim|
+--------------+----------+
|    Hello.    |Hello.    |
+--------------+----------+



In [14]:
df.withColumn("ltrim",ltrim(col("dummy"))). \
    withColumn("rtrim", rtrim(col("dummy"))). \
    withColumn("trim", trim(col("dummy"))). \
show()

+--------------+----------+----------+------+
|         dummy|     ltrim|     rtrim|  trim|
+--------------+----------+----------+------+
|    Hello.    |Hello.    |    Hello.|Hello.|
+--------------+----------+----------+------+



In [16]:
spark.sql("describe function rtrim").show(truncate=False)

+-----------------------------------------------------------------------------+
|function_desc                                                                |
+-----------------------------------------------------------------------------+
|Function: rtrim                                                              |
|Class: org.apache.spark.sql.catalyst.expressions.StringTrimRight             |
|Usage: 
    rtrim(str) - Removes the trailing space characters from `str`.
  |
+-----------------------------------------------------------------------------+



In [18]:
spark.sql("show databases").show(3)

+--------------------+
|           namespace|
+--------------------+
|0000000000000_msdian|
|0000000009874_retail|
|          00000_2_db|
+--------------------+
only showing top 3 rows



In [21]:
from pyspark.sql.functions import expr

In [23]:
df.withColumn("ltrim", expr("ltrim(dummy)")). \
    withColumn("rtrim", expr("rtrim('.',rtrim(dummy))")). \
    withColumn("trim", trim(col("dummy"))). \
    show()

+--------------+----------+---------+------+
|         dummy|     ltrim|    rtrim|  trim|
+--------------+----------+---------+------+
|    Hello.    |Hello.    |    Hello|Hello.|
+--------------+----------+---------+------+



In [26]:
spark.sql("describe function trim").show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|function_desc                                                                                                                                                                                                                                                                             

In [28]:
df.withColumn("ltrim",expr("trim(LEADING ' ' FROM dummy)")). \
    withColumn("rtrim",expr("trim(TRAILING '.' from rtrim(dummy))")). \
    withColumn("trim", expr("trim(BOTH ' ' FROM dummy)")). \
    show(truncate=False)

+--------------+----------+---------+------+
|dummy         |ltrim     |rtrim    |trim  |
+--------------+----------+---------+------+
|    Hello.    |Hello.    |    Hello|Hello.|
+--------------+----------+---------+------+



### 175 Date and Time Manipulation Functions 

In [1]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

spark = SparkSession. \
        builder. \
        config('spark.ui.port','0'). \
        config('spark.sql.warehouse.dir', '/user/{username}/warehouse'). \
        config('spark.shuffle.io.connectionTimeout',6000). \
        enableHiveSupport(). \
        appName(f'{username} | Python - Data Processing - Overview'). \
        master('yarn'). \
        getOrCreate()

In [2]:
l = [('X',)]

In [3]:
df = spark.createDataFrame(l).toDF("dummy")

In [4]:
df

dummy
X


In [5]:
df.show()

+-----+
|dummy|
+-----+
|    X|
+-----+



In [6]:
from pyspark.sql.functions import current_date, current_timestamp

In [7]:
df.select(current_date()).show(truncate=False)

+--------------+
|current_date()|
+--------------+
|2024-02-09    |
+--------------+



In [8]:
df.select(current_timestamp()).show(truncate=False)

+-----------------------+
|current_timestamp()    |
+-----------------------+
|2024-02-09 13:32:59.513|
+-----------------------+



In [9]:
from pyspark.sql.functions import lit, to_date, to_timestamp

In [10]:
df.select(to_date(lit('20240209'), 'yyyyMMdd').alias('to_date')).show(truncate=False)

+----------+
|to_date   |
+----------+
|2024-02-09|
+----------+



In [18]:
df.select(to_date(lit('20240209 1725'), 'yyyyMMdd HHmm').alias('to_date')).show(truncate=False)

+----------+
|to_date   |
+----------+
|2024-02-09|
+----------+



In [19]:
df.select(to_timestamp(lit('20240209 1725'), 'yyyyMMdd HHmm').alias('to_timestamp')).show(truncate=False)

+-------------------+
|to_timestamp       |
+-------------------+
|2024-02-09 17:25:00|
+-------------------+



### 176 Date and Time ARithmetic on Spark Data Frames

In [2]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

spark = SparkSession. \
    builder. \
    config('spark.ui.port',0). \
    config('spark.sql.warehouse.dir',f"/user/{username}/warehouse"). \
    config('spark.shuffle.io.connectionTimeOut',6000). \
    enableHiveSupport(). \
    appName(f"{username} | Pyspark Processing column Data"). \
    master('yarn'). \
    getOrCreate()
    

In [3]:
datetimes = [("2014-02-28", "2014-02-28 10:00:00.123"),
                     ("2016-02-29", "2016-02-29 08:08:08.999"),
                     ("2017-10-31", "2017-12-31 11:59:59.123"),
                     ("2019-11-30", "2019-08-31 00:00:00.000")
                ]

In [4]:
datetimesDF = spark.createDataFrame(datetimes,schema = "date STRING, time STRING")

In [5]:
datetimesDF.show(truncate=False)

+----------+-----------------------+
|date      |time                   |
+----------+-----------------------+
|2014-02-28|2014-02-28 10:00:00.123|
|2016-02-29|2016-02-29 08:08:08.999|
|2017-10-31|2017-12-31 11:59:59.123|
|2019-11-30|2019-08-31 00:00:00.000|
+----------+-----------------------+



In [7]:
from pyspark.sql.functions import date_add, date_sub

In [8]:
help(date_add)

Help on function date_add in module pyspark.sql.functions:

date_add(start, days)
    Returns the date that is `days` days after `start`
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(date_add(df.dt, 1).alias('next_date')).collect()
    [Row(next_date=datetime.date(2015, 4, 9))]



In [9]:
help(date_sub)

Help on function date_sub in module pyspark.sql.functions:

date_sub(start, days)
    Returns the date that is `days` days before `start`
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(date_sub(df.dt, 1).alias('prev_date')).collect()
    [Row(prev_date=datetime.date(2015, 4, 7))]



In [10]:
datetimesDF. \
    withColumn("date_add_date",date_add("date",10)). \
    withColumn("time_add_date", date_add("time",10)). \
    withColumn("date_sub_date",date_sub("date",10)). \
    withColumn("time_sub_date", date_sub("time",10)). \
show(truncate=False)

+----------+-----------------------+-------------+-------------+-------------+-------------+
|date      |time                   |date_add_date|time_add_date|date_sub_date|time_sub_date|
+----------+-----------------------+-------------+-------------+-------------+-------------+
|2014-02-28|2014-02-28 10:00:00.123|2014-03-10   |2014-03-10   |2014-02-18   |2014-02-18   |
|2016-02-29|2016-02-29 08:08:08.999|2016-03-10   |2016-03-10   |2016-02-19   |2016-02-19   |
|2017-10-31|2017-12-31 11:59:59.123|2017-11-10   |2018-01-10   |2017-10-21   |2017-12-21   |
|2019-11-30|2019-08-31 00:00:00.000|2019-12-10   |2019-09-10   |2019-11-20   |2019-08-21   |
+----------+-----------------------+-------------+-------------+-------------+-------------+



In [12]:
from pyspark.sql.functions import current_date,current_timestamp, datediff

In [15]:
datetimesDF. \
    withColumn("datediff_date",datediff(current_date(),"date")). \
    withColumn("datediff_time",datediff(current_timestamp(), "time")). \
show(truncate=False)

+----------+-----------------------+-------------+-------------+
|date      |time                   |datediff_date|datediff_time|
+----------+-----------------------+-------------+-------------+
|2014-02-28|2014-02-28 10:00:00.123|3634         |3634         |
|2016-02-29|2016-02-29 08:08:08.999|2903         |2903         |
|2017-10-31|2017-12-31 11:59:59.123|2293         |2232         |
|2019-11-30|2019-08-31 00:00:00.000|1533         |1624         |
+----------+-----------------------+-------------+-------------+



In [18]:
from pyspark.sql.functions import months_between, add_months, round

In [19]:
help(months_between)

Help on function months_between in module pyspark.sql.functions:

months_between(date1, date2, roundOff=True)
    Returns number of months between dates date1 and date2.
    If date1 is later than date2, then the result is positive.
    If date1 and date2 are on the same day of month, or both are the last day of month,
    returns an integer (time of day will be ignored).
    The result is rounded off to 8 digits unless `roundOff` is set to `False`.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['date1', 'date2'])
    >>> df.select(months_between(df.date1, df.date2).alias('months')).collect()
    [Row(months=3.94959677)]
    >>> df.select(months_between(df.date1, df.date2, False).alias('months')).collect()
    [Row(months=3.9495967741935485)]



In [22]:
datetimesDF. \
    withColumn("months_between_date",round(months_between(current_date(),"date"),2)). \
    withColumn("months_between_time", round(months_between(current_timestamp(),"time"),2)). \
    withColumn("add_months_date",add_months("date",3)). \
    withColumn("add_months_time", add_months("time",3)). \
show(truncate=False)

+----------+-----------------------+-------------------+-------------------+---------------+---------------+
|date      |time                   |months_between_date|months_between_time|add_months_date|add_months_time|
+----------+-----------------------+-------------------+-------------------+---------------+---------------+
|2014-02-28|2014-02-28 10:00:00.123|119.42             |119.41             |2014-05-28     |2014-05-28     |
|2016-02-29|2016-02-29 08:08:08.999|95.39              |95.38              |2016-05-29     |2016-05-29     |
|2017-10-31|2017-12-31 11:59:59.123|75.32              |73.31              |2018-01-31     |2018-03-31     |
|2019-11-30|2019-08-31 00:00:00.000|50.35              |53.33              |2020-02-29     |2019-11-30     |
+----------+-----------------------+-------------------+-------------------+---------------+---------------+



### 177 Using Date and Time Trunc functions on spark Data Frame columns

In [1]:
import getpass

username = getpass.getuser()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession. \
    builder. \
    config("spark.ui.port",0). \
    config("spark.sql.warehouse.dir",f"/user/{username}/warehouse"). \
    config("spark.shuffle.io.connectionTimeOut",6000). \
    enableHiveSupport(). \
    appName(f"{username} | Python Processing Column Data"). \
    master('yarn'). \
    getOrCreate()

In [3]:
l = [('X',)]

In [4]:
df = spark.createDataFrame(l).toDF("dummy")

In [5]:
df

dummy
X


In [8]:
datetimes = [("2014-02-28", "2014-02-28 10:00:00.123"),
                     ("2016-02-29", "2016-02-29 08:08:08.999"),
                     ("2017-10-31", "2017-12-31 11:59:59.123"),
                     ("2019-11-30", "2019-08-31 00:00:00.000")
                ]

In [14]:
datetimesDF = spark.createDataFrame(datetimes,"date STRING, time STRING")

In [15]:
from pyspark.sql.functions import trunc

In [16]:
datetimesDF. \
    withColumn("date_trunc",trunc("date","MM")). \
    withColumn("time_trunc", trunc("time", "yy")). \
show(truncate=False)

+----------+-----------------------+----------+----------+
|date      |time                   |date_trunc|time_trunc|
+----------+-----------------------+----------+----------+
|2014-02-28|2014-02-28 10:00:00.123|2014-02-01|2014-01-01|
|2016-02-29|2016-02-29 08:08:08.999|2016-02-01|2016-01-01|
|2017-10-31|2017-12-31 11:59:59.123|2017-10-01|2017-01-01|
|2019-11-30|2019-08-31 00:00:00.000|2019-11-01|2019-01-01|
+----------+-----------------------+----------+----------+



In [17]:
from pyspark.sql.functions import date_trunc

In [19]:
datetimesDF. \
    withColumn("date_trunc",date_trunc("MM","date")).\
    withColumn("time_trunc",date_trunc("yy","time")). \
show(truncate=False)

+----------+-----------------------+-------------------+-------------------+
|date      |time                   |date_trunc         |time_trunc         |
+----------+-----------------------+-------------------+-------------------+
|2014-02-28|2014-02-28 10:00:00.123|2014-02-01 00:00:00|2014-01-01 00:00:00|
|2016-02-29|2016-02-29 08:08:08.999|2016-02-01 00:00:00|2016-01-01 00:00:00|
|2017-10-31|2017-12-31 11:59:59.123|2017-10-01 00:00:00|2017-01-01 00:00:00|
|2019-11-30|2019-08-31 00:00:00.000|2019-11-01 00:00:00|2019-01-01 00:00:00|
+----------+-----------------------+-------------------+-------------------+



In [23]:
datetimesDF. \
    withColumn("date_dt",date_trunc("Hour","date")). \
    withColumn("time_dt",date_trunc("Hour","time")). \
    withColumn("time_dt1",date_trunc("dd","time")). \
show(truncate=False)


+----------+-----------------------+-------------------+-------------------+-------------------+
|date      |time                   |date_dt            |time_dt            |time_dt1           |
+----------+-----------------------+-------------------+-------------------+-------------------+
|2014-02-28|2014-02-28 10:00:00.123|2014-02-28 00:00:00|2014-02-28 10:00:00|2014-02-28 00:00:00|
|2016-02-29|2016-02-29 08:08:08.999|2016-02-29 00:00:00|2016-02-29 08:00:00|2016-02-29 00:00:00|
|2017-10-31|2017-12-31 11:59:59.123|2017-10-31 00:00:00|2017-12-31 11:00:00|2017-12-31 00:00:00|
|2019-11-30|2019-08-31 00:00:00.000|2019-11-30 00:00:00|2019-08-31 00:00:00|2019-08-31 00:00:00|
+----------+-----------------------+-------------------+-------------------+-------------------+



###  178 Date and Time Extract functions 

In [1]:
import getpass

username = getpass.getuser()

In [2]:
username

'itv011204'

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession. \
    builder. \
    config("spark.ui.port",0). \
    config("spark.sql.warehouse.dir",f"/user/{username}/warehouse"). \
    config("spark.shuffle.io.connectionTimeout",6000). \
    enableHiveSupport(). \
    appName(f"{username} | Python Processing Data columns"). \
    master("yarn"). \
    getOrCreate()

In [4]:
l = [('X',)]

In [5]:
df = spark.createDataFrame(l,schema="dummy STRING")

In [6]:
df

dummy
X


In [7]:
from pyspark.sql.functions import weekofyear, dayofmonth, dayofweek, dayofyear, year, month

from pyspark.sql.functions import current_date

In [9]:
df.select(current_date().alias("current_date"),
         year(current_date().alias("year")),
         month(current_date().alias("month")),
         weekofyear(current_date()).alias("week_of_year"),
         dayofmonth(current_date()).alias("day_of_month"),
         dayofyear(current_date()).alias("day_of_year"),
         dayofweek(current_date()).alias("dayofweek")). \
show(truncate=False)

+------------+------------------------------+--------------------------------+------------+------------+-----------+---------+
|current_date|year(current_date() AS `year`)|month(current_date() AS `month`)|week_of_year|day_of_month|day_of_year|dayofweek|
+------------+------------------------------+--------------------------------+------------+------------+-----------+---------+
|2024-02-10  |2024                          |2                               |6           |10          |41         |7        |
+------------+------------------------------+--------------------------------+------------+------------+-----------+---------+



In [10]:
from pyspark.sql.functions import current_timestamp, hour, minute, second 

In [11]:
df.select(current_timestamp().alias("current_timestamp"),
         hour(current_timestamp()).alias("hour"),
         minute(current_timestamp()).alias("minute"),
         second(current_timestamp()).alias("second")).\
show(truncate=False)

+-----------------------+----+------+------+
|current_timestamp      |hour|minute|second|
+-----------------------+----+------+------+
|2024-02-10 11:14:50.558|11  |14    |50    |
+-----------------------+----+------+------+



In [12]:
df.select(current_timestamp().alias("current_timestamp"),
         year(current_timestamp()).alias("year"),
         month(current_timestamp()).alias("month"),
         dayofmonth(current_timestamp()).alias("dayofmonth"),
         hour(current_timestamp()).alias("hour"),
         minute(current_timestamp()).alias("minute"),
         second(current_timestamp()).alias("second")).\
show(truncate=False)

+-----------------------+----+-----+----------+----+------+------+
|current_timestamp      |year|month|dayofmonth|hour|minute|second|
+-----------------------+----+-----+----------+----+------+------+
|2024-02-10 11:14:52.502|2024|2    |10        |11  |14    |52    |
+-----------------------+----+-----+----------+----+------+------+



In [13]:
from pyspark.sql import SparkSession

spark = SparkSession. \
    builder. \
    config("spark.ui.port",0). \
    config("spark.sql.warehouse.dir",f"/user/{username}/warehouse"). \
    config("spark.shuffle.io.connectionTimeout",6000). \
    enableHiveSupport(). \
    appName(f"{username} | Python Processing Data columns"). \
    master("yarn"). \
    getOrCreate()

In [14]:
datetimes = [("2014-02-28", "2014-02-28 10:00:00.123"),
                     ("2016-02-29", "2016-02-29 08:08:08.999"),
                     ("2017-10-31", "2017-12-31 11:59:59.123"),
                     ("2019-11-30", "2019-08-31 00:00:00.000")
                ]

In [15]:
datetimesDF = spark.createDataFrame(datetimes).toDF("date", "time")

In [16]:
datetimesDF.show(truncate=False)

+----------+-----------------------+
|date      |time                   |
+----------+-----------------------+
|2014-02-28|2014-02-28 10:00:00.123|
|2016-02-29|2016-02-29 08:08:08.999|
|2017-10-31|2017-12-31 11:59:59.123|
|2019-11-30|2019-08-31 00:00:00.000|
+----------+-----------------------+



----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 52352)
Traceback (most recent call last):
  File "/opt/anaconda3/envs/beakerx/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/anaconda3/envs/beakerx/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/anaconda3/envs/beakerx/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/anaconda3/envs/beakerx/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/accumulators.py", line 262, in handle
    poll(accum_updates)
  File "/opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/accumulators.py", line 235, in poll
    if func():
  File "/opt/spark-3.1.2-bin-hadoop3.2/python/pyspark/accumulato

### 179 to_date to_timestamp

In [1]:
import getpass

username = getpass.getuser()

In [2]:
username

'itv011204'

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession. \
        builder. \
        config('spark.ui.port',0). \
        config('spark.sql.warehouse.dir', f'/user/{username}/warehouse'). \
        config('spark.sql.io.connectionTimeout','6000'). \
        enableHiveSupport(). \
        appName(f'{username} | Data Processing functions'). \
        master('yarn'). \
        getOrCreate()

In [4]:
datetimes = [(20140228, "28-Feb-2014 10:00:00.123"),
                     (20160229, "20-Feb-2016 08:08:08.999"),
                     (20171031, "31-Dec-2017 11:59:59.123"),
                     (20191130, "31-Aug-2019 00:00:00.000")
                ]

In [5]:
datetimesDF = spark.createDataFrame(datetimes,schema="date BIGINT, time STRING")

In [6]:
datetimesDF.show(truncate=False)

+--------+------------------------+
|date    |time                    |
+--------+------------------------+
|20140228|28-Feb-2014 10:00:00.123|
|20160229|20-Feb-2016 08:08:08.999|
|20171031|31-Dec-2017 11:59:59.123|
|20191130|31-Aug-2019 00:00:00.000|
+--------+------------------------+



In [7]:
datetimesDF.printSchema()

root
 |-- date: long (nullable = true)
 |-- time: string (nullable = true)



In [8]:
l=[('X',)]

In [9]:
df = spark.createDataFrame(l).toDF('dummy')

In [10]:
df

dummy
X


In [11]:
from pyspark.sql.functions import to_date, lit

In [12]:
help(to_date)

Help on function to_date in module pyspark.sql.functions:

to_date(col, format=None)
    Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.DateType`
    using the optionally specified format. Specify formats according to `datetime pattern`_.
    By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format
    is omitted. Equivalent to ``col.cast("date")``.
    
    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
    
    .. versionadded:: 2.2.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_date(df.t).alias('date')).collect()
    [Row(date=datetime.date(1997, 2, 28))]
    
    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_date(df.t, 'yyyy-MM-dd HH:mm:ss').alias('date')).collect()
    [Row(date=datetime.date(1997, 2, 28))]



In [13]:
df.select(to_date(lit('20210302'),'yyyyMMdd').alias("to_date")).show(truncate=False)

+----------+
|to_date   |
+----------+
|2021-03-02|
+----------+



In [14]:
df.select(to_date(lit('2021061'),'yyyyDDD').alias("to_date")).show(truncate=False)

+----------+
|to_date   |
+----------+
|2021-03-02|
+----------+



In [15]:
df.select(to_date(lit('02/03/2021'),'dd/MM/yyyy').alias('to_date')).show(truncate=False)

+----------+
|to_date   |
+----------+
|2021-03-02|
+----------+



In [16]:
df.select(to_date(lit('02-03-2021'),'dd-MM-yyyy').alias('to_date')).show(truncate=False)

+----------+
|to_date   |
+----------+
|2021-03-02|
+----------+



In [17]:
df.select(to_date(lit('02-Mar-2021'),'dd-MMM-yyyy').alias('to_date')).show(truncate=False)

+----------+
|to_date   |
+----------+
|2021-03-02|
+----------+



In [18]:
df.select(to_date(lit('02-March-2021'),'dd-MMMM-yyyy').alias('to_date')).show(truncate=False)

+----------+
|to_date   |
+----------+
|2021-03-02|
+----------+



In [19]:
from pyspark.sql.functions import to_timestamp

In [20]:
df.select(to_timestamp(lit('02-Mar-2021'),'dd-MMM-yyyy').alias('to_timestamp')).show(truncate=False)

+-------------------+
|to_timestamp       |
+-------------------+
|2021-03-02 00:00:00|
+-------------------+



In [21]:
df.select(to_timestamp(lit('02Mar2021'),'ddMMMyyyy').alias('to_timestamp')).show(truncate=False)

+-------------------+
|to_timestamp       |
+-------------------+
|2021-03-02 00:00:00|
+-------------------+



In [22]:
df.select(to_timestamp(lit('02-Mar-2021 17:30:15'),'dd-MMM-yyyy HH:mm:ss').alias('to_timestamp')).show(truncate=False)

+-------------------+
|to_timestamp       |
+-------------------+
|2021-03-02 17:30:15|
+-------------------+



In [23]:
datetimesDF.show(truncate=False)

+--------+------------------------+
|date    |time                    |
+--------+------------------------+
|20140228|28-Feb-2014 10:00:00.123|
|20160229|20-Feb-2016 08:08:08.999|
|20171031|31-Dec-2017 11:59:59.123|
|20191130|31-Aug-2019 00:00:00.000|
+--------+------------------------+



In [24]:
datetimesDF.printSchema()

root
 |-- date: long (nullable = true)
 |-- time: string (nullable = true)



In [25]:
from pyspark.sql.functions import col

In [26]:
datetimesDF.select(
            to_date(col('date').cast('string'),'yyyyMMdd').alias('to_date'),
            to_timestamp(col('date').cast('string'),'yyyyMMdd').alias('to_timestamp')). \
    show(truncate=False)

+----------+-------------------+
|to_date   |to_timestamp       |
+----------+-------------------+
|2014-02-28|2014-02-28 00:00:00|
|2016-02-29|2016-02-29 00:00:00|
|2017-10-31|2017-10-31 00:00:00|
|2019-11-30|2019-11-30 00:00:00|
+----------+-------------------+



In [31]:
datetimesDF.select(
    to_date(col('date').cast('string'),'yyyyMMdd').alias('to_date1'),
    to_date(col('time'),'dd-MMM-yyyy HH:mm:ss.SSS').alias('to_date2'),
    to_timestamp(col('date').cast('string'),'yyyyMMdd').alias('to_timestamp1'),
    to_timestamp(col('time'),'dd-MMM-yyyy HH:mm:ss.SSS').alias('to_timestamp2')
). \
show(truncate=False)

+----------+----------+-------------------+-----------------------+
|to_date1  |to_date2  |to_timestamp1      |to_timestamp2          |
+----------+----------+-------------------+-----------------------+
|2014-02-28|2014-02-28|2014-02-28 00:00:00|2014-02-28 10:00:00.123|
|2016-02-29|2016-02-20|2016-02-29 00:00:00|2016-02-20 08:08:08.999|
|2017-10-31|2017-12-31|2017-10-31 00:00:00|2017-12-31 11:59:59.123|
|2019-11-30|2019-08-31|2019-11-30 00:00:00|2019-08-31 00:00:00    |
+----------+----------+-------------------+-----------------------+



### 180 Using date_format

In [34]:
import getpass

username = getpass.getuser()

In [35]:
from pyspark.sql import SparkSession

spark = SparkSession. \
        builder. \
        config('spark.ui.port',0). \
        config('spark.sql.warehouse.dir', f'/user/{username}/warehouse'). \
        config('spark.sql.io.connectionTimeout','6000'). \
        enableHiveSupport(). \
        appName(f'{username} | Data Processing functions'). \
        master('yarn'). \
        getOrCreate()

In [36]:
datetimes = [("2014-02-28", "2014-02-28 10:00:00.123"),
                     ("2016-02-29", "2016-02-29 08:08:08.999"),
                     ("2017-10-31", "2017-12-31 11:59:59.123"),
                     ("2019-11-30", "2019-08-31 00:00:00.000")
                ]

In [37]:
datetimesDF = spark.createDataFrame(datetimes, schema="date STRING, time STRING")

In [38]:
datetimesDF.show(truncate=False)

+----------+-----------------------+
|date      |time                   |
+----------+-----------------------+
|2014-02-28|2014-02-28 10:00:00.123|
|2016-02-29|2016-02-29 08:08:08.999|
|2017-10-31|2017-12-31 11:59:59.123|
|2019-11-30|2019-08-31 00:00:00.000|
+----------+-----------------------+



In [41]:
from pyspark.sql.functions import date_format

In [42]:
datetimesDF. \
    withColumn('date_ym',date_format('date','yyyyMM')). \
    withColumn('time_ym',date_format('time','yyyyMM')). \
show(truncate=False)

+----------+-----------------------+-------+-------+
|date      |time                   |date_ym|time_ym|
+----------+-----------------------+-------+-------+
|2014-02-28|2014-02-28 10:00:00.123|201402 |201402 |
|2016-02-29|2016-02-29 08:08:08.999|201602 |201602 |
|2017-10-31|2017-12-31 11:59:59.123|201710 |201712 |
|2019-11-30|2019-08-31 00:00:00.000|201911 |201908 |
+----------+-----------------------+-------+-------+



In [44]:
datetimesDF. \
    withColumn('date_ym',date_format('date','yyyyMM').cast('int')). \
    withColumn('time_ym',date_format('time','yyyyMM').cast('int')). \
show(truncate=False)

+----------+-----------------------+-------+-------+
|date      |time                   |date_ym|time_ym|
+----------+-----------------------+-------+-------+
|2014-02-28|2014-02-28 10:00:00.123|201402 |201402 |
|2016-02-29|2016-02-29 08:08:08.999|201602 |201602 |
|2017-10-31|2017-12-31 11:59:59.123|201710 |201712 |
|2019-11-30|2019-08-31 00:00:00.000|201911 |201908 |
+----------+-----------------------+-------+-------+



In [45]:
datetimesDF. \
    withColumn('date_ym',date_format('date','yyyyMMddHHmmss')). \
    withColumn('time_ym',date_format('time','yyyyMMddHHmmss')). \
show(truncate=False)

+----------+-----------------------+--------------+--------------+
|date      |time                   |date_ym       |time_ym       |
+----------+-----------------------+--------------+--------------+
|2014-02-28|2014-02-28 10:00:00.123|20140228000000|20140228100000|
|2016-02-29|2016-02-29 08:08:08.999|20160229000000|20160229080808|
|2017-10-31|2017-12-31 11:59:59.123|20171031000000|20171231115959|
|2019-11-30|2019-08-31 00:00:00.000|20191130000000|20190831000000|
+----------+-----------------------+--------------+--------------+



In [46]:
datetimesDF. \
    withColumn('date_ym',date_format('date','yyyyMMddHHmmss')). \
    withColumn('time_ym',date_format('time','yyyyMMddHHmmss')). \
printSchema()

root
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- date_ym: string (nullable = true)
 |-- time_ym: string (nullable = true)



In [47]:
datetimesDF. \
    withColumn('date_ym',date_format('date','yyyyMMddHHmmss').cast('long')). \
    withColumn('time_ym',date_format('time','yyyyMMddHHmmss').cast('long')). \
show(truncate=False)

+----------+-----------------------+--------------+--------------+
|date      |time                   |date_ym       |time_ym       |
+----------+-----------------------+--------------+--------------+
|2014-02-28|2014-02-28 10:00:00.123|20140228000000|20140228100000|
|2016-02-29|2016-02-29 08:08:08.999|20160229000000|20160229080808|
|2017-10-31|2017-12-31 11:59:59.123|20171031000000|20171231115959|
|2019-11-30|2019-08-31 00:00:00.000|20191130000000|20190831000000|
+----------+-----------------------+--------------+--------------+



In [48]:
datetimesDF. \
    withColumn('date_ym',date_format('date','yyyyDDD')). \
    withColumn('time_ym',date_format('time','yyyyDDD')). \
printSchema()

root
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- date_ym: string (nullable = true)
 |-- time_ym: string (nullable = true)



In [49]:
datetimesDF. \
    withColumn('date_ym',date_format('date','yyyyDDD')). \
    withColumn('time_ym',date_format('time','yyyyDDD')). \
show(truncate=False)

+----------+-----------------------+-------+-------+
|date      |time                   |date_ym|time_ym|
+----------+-----------------------+-------+-------+
|2014-02-28|2014-02-28 10:00:00.123|2014059|2014059|
|2016-02-29|2016-02-29 08:08:08.999|2016060|2016060|
|2017-10-31|2017-12-31 11:59:59.123|2017304|2017365|
|2019-11-30|2019-08-31 00:00:00.000|2019334|2019243|
+----------+-----------------------+-------+-------+



In [50]:
datetimesDF. \
    withColumn('date_ym',date_format('date','MMMM d, yyyy')). \
    withColumn('time_ym',date_format('time','MMMM d, yyyy')). \
show(truncate=False)

+----------+-----------------------+-----------------+-----------------+
|date      |time                   |date_ym          |time_ym          |
+----------+-----------------------+-----------------+-----------------+
|2014-02-28|2014-02-28 10:00:00.123|February 28, 2014|February 28, 2014|
|2016-02-29|2016-02-29 08:08:08.999|February 29, 2016|February 29, 2016|
|2017-10-31|2017-12-31 11:59:59.123|October 31, 2017 |December 31, 2017|
|2019-11-30|2019-08-31 00:00:00.000|November 30, 2019|August 31, 2019  |
+----------+-----------------------+-----------------+-----------------+



In [53]:
datetimesDF. \
    withColumn('date_ym',date_format('date','EE')). \
    withColumn('time_ym',date_format('time','EEEE')). \
show(truncate=False)

+----------+-----------------------+-------+--------+
|date      |time                   |date_ym|time_ym |
+----------+-----------------------+-------+--------+
|2014-02-28|2014-02-28 10:00:00.123|Fri    |Friday  |
|2016-02-29|2016-02-29 08:08:08.999|Mon    |Monday  |
|2017-10-31|2017-12-31 11:59:59.123|Tue    |Sunday  |
|2019-11-30|2019-08-31 00:00:00.000|Sat    |Saturday|
+----------+-----------------------+-------+--------+



### 181 Dealing with Unix Timestamp

In [1]:
import getpass

username = getpass.getuser()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession. \
        builder. \
        config('spark.ui.port',0). \
        config('spark.sql.warehouse.dir', f'/user/{username}/warehouse'). \
        config('spark.sql.io.connectionTimeout','6000'). \
        enableHiveSupport(). \
        appName(f'{username} | Data Processing functions'). \
        master('yarn'). \
        getOrCreate()

In [9]:
from pyspark.sql.functions import unix_timestamp, col

In [10]:
datetimes = [(20140228, "2014-02-28", "2014-02-28 10:00:00.123"),
                     (20160229, "2016-02-29", "2016-02-29 08:08:08.999"),
                     (20171031, "2017-10-31", "2017-12-31 11:59:59.123"),
                     (20191130, "2019-11-30", "2019-08-31 00:00:00.000")
                ]

In [11]:
datetimesDF = spark.createDataFrame(datetimes).toDF("dateid","date","time")

In [12]:
datetimesDF.show(truncate=False)

+--------+----------+-----------------------+
|dateid  |date      |time                   |
+--------+----------+-----------------------+
|20140228|2014-02-28|2014-02-28 10:00:00.123|
|20160229|2016-02-29|2016-02-29 08:08:08.999|
|20171031|2017-10-31|2017-12-31 11:59:59.123|
|20191130|2019-11-30|2019-08-31 00:00:00.000|
+--------+----------+-----------------------+



In [13]:
datetimesDF.printSchema()

root
 |-- dateid: long (nullable = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)



In [14]:
datetimesDF. \
    withColumn("unix_date_id",unix_timestamp(col('dateid').cast('string'),'yyyyMMdd')). \
show(truncate=False)

+--------+----------+-----------------------+------------+
|dateid  |date      |time                   |unix_date_id|
+--------+----------+-----------------------+------------+
|20140228|2014-02-28|2014-02-28 10:00:00.123|1393563600  |
|20160229|2016-02-29|2016-02-29 08:08:08.999|1456722000  |
|20171031|2017-10-31|2017-12-31 11:59:59.123|1509422400  |
|20191130|2019-11-30|2019-08-31 00:00:00.000|1575090000  |
+--------+----------+-----------------------+------------+



In [15]:
datetimesDF. \
    withColumn("unix_date_id",unix_timestamp(col('dateid').cast('string'),'yyyyMMdd')). \
    withColumn("unix_date",unix_timestamp('date','yyyy-MM-dd')). \
    withColumn("unix_time",unix_timestamp('time','yyyy-MM-dd HH:mm:ss.SSS')). \
show(truncate=False)

+--------+----------+-----------------------+------------+----------+----------+
|dateid  |date      |time                   |unix_date_id|unix_date |unix_time |
+--------+----------+-----------------------+------------+----------+----------+
|20140228|2014-02-28|2014-02-28 10:00:00.123|1393563600  |1393563600|1393599600|
|20160229|2016-02-29|2016-02-29 08:08:08.999|1456722000  |1456722000|1456751288|
|20171031|2017-10-31|2017-12-31 11:59:59.123|1509422400  |1509422400|1514739599|
|20191130|2019-11-30|2019-08-31 00:00:00.000|1575090000  |1575090000|1567224000|
+--------+----------+-----------------------+------------+----------+----------+



In [16]:
from pyspark.sql.functions import from_unixtime

In [17]:
unixtimes = [(1393561800, ),
             (1456713488, ),
             (1514701799, ),
             (1567189800, )
            ]

In [18]:
unixtimesDF = spark.createDataFrame(unixtimes).toDF("unixtime")

In [19]:
unixtimesDF.show()

+----------+
|  unixtime|
+----------+
|1393561800|
|1456713488|
|1514701799|
|1567189800|
+----------+



In [20]:
unixtimesDF. \
    withColumn('date',from_unixtime('unixtime','yyyy-MM-dd')). \
show(truncate=False)

+----------+----------+
|unixtime  |date      |
+----------+----------+
|1393561800|2014-02-27|
|1456713488|2016-02-28|
|1514701799|2017-12-31|
|1567189800|2019-08-30|
+----------+----------+



In [21]:
unixtimesDF. \
    withColumn('date',from_unixtime('unixtime','yyyy-MM-dd')). \
    withColumn('time',from_unixtime('unixtime','yyyy-MM-dd HH:mm:ss')). \
    withColumn('date description',from_unixtime('unixtime','MMMM d, yyyy')). \
show(truncate=False)

+----------+----------+-------------------+-----------------+
|unixtime  |date      |time               |date description |
+----------+----------+-------------------+-----------------+
|1393561800|2014-02-27|2014-02-27 23:30:00|February 27, 2014|
|1456713488|2016-02-28|2016-02-28 21:38:08|February 28, 2016|
|1514701799|2017-12-31|2017-12-31 01:29:59|December 31, 2017|
|1567189800|2019-08-30|2019-08-30 14:30:00|August 30, 2019  |
+----------+----------+-------------------+-----------------+



In [22]:
unixtimesDF. \
    withColumn('date',from_unixtime('unixtime','yyyy-MM-dd')). \
    withColumn('time',from_unixtime('unixtime')). \
show(truncate=False)

+----------+----------+-------------------+
|unixtime  |date      |time               |
+----------+----------+-------------------+
|1393561800|2014-02-27|2014-02-27 23:30:00|
|1456713488|2016-02-28|2016-02-28 21:38:08|
|1514701799|2017-12-31|2017-12-31 01:29:59|
|1567189800|2019-08-30|2019-08-30 14:30:00|
+----------+----------+-------------------+



### 182  dealing with Nulls 

In [23]:
import getpass

username = getpass.getuser()

In [24]:
from pyspark.sql import SparkSession

spark = SparkSession. \
        builder. \
        config('spark.ui.port',0). \
        config('spark.sql.warehouse.dir', f'/user/{username}/warehouse'). \
        config('spark.sql.io.connectionTimeout','6000'). \
        enableHiveSupport(). \
        appName(f'{username} | Data Processing functions'). \
        master('yarn'). \
        getOrCreate()

In [25]:
employees = [(1, "Scott", "Tiger", 1000.0, 10,
                      "united states", "+1 123 456 7890", "123 45 6789"
                     ),
                     (2, "Henry", "Ford", 1250.0, None,
                      "India", "+91 234 567 8901", "456 78 9123"
                     ),
                     (3, "Nick", "Junior", 750.0, '',
                      "united KINGDOM", "+44 111 111 1111", "222 33 4444"
                     ),
                     (4, "Bill", "Gomes", 1500.0, 10,
                      "AUSTRALIA", "+61 987 654 3210", "789 12 6118"
                     )
                ]

In [26]:
employeesDF = spark. \
    createDataFrame(employees,
                    schema="""employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, bonus STRING, nationality STRING,
                    phone_number STRING, ssn STRING"""
                   )

In [27]:
employeesDF.show(truncate=False)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|nationality   |phone_number    |ssn        |
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|1          |Scott     |Tiger    |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|
|2          |Henry     |Ford     |1250.0|null |India         |+91 234 567 8901|456 78 9123|
|3          |Nick      |Junior   |750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|
|4          |Bill      |Gomes    |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [29]:
from pyspark.sql.functions import coalesce

In [30]:
employeesDF. \
    withColumn('bonus',coalesce('bonus',0)). \
show(truncate=False)

TypeError: Invalid argument, not a string or column: 0 of type <class 'int'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.

In [31]:
from pyspark.sql.functions import lit

In [35]:
employeesDF. \
    withColumn('bonus1',coalesce('bonus',lit('0'))). \
show(truncate=False)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|employee_id|first_name|last_name|salary|bonus|nationality   |phone_number    |ssn        |bonus1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|1          |Scott     |Tiger    |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|10    |
|2          |Henry     |Ford     |1250.0|null |India         |+91 234 567 8901|456 78 9123|0     |
|3          |Nick      |Junior   |750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|      |
|4          |Bill      |Gomes    |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|10    |
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+



In [34]:
from pyspark.sql.functions import col

In [36]:
employeesDF. \
    withColumn('bonus1', col('bonus').cast('int')). \
show(truncate=False)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|employee_id|first_name|last_name|salary|bonus|nationality   |phone_number    |ssn        |bonus1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|1          |Scott     |Tiger    |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|10    |
|2          |Henry     |Ford     |1250.0|null |India         |+91 234 567 8901|456 78 9123|null  |
|3          |Nick      |Junior   |750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|null  |
|4          |Bill      |Gomes    |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|10    |
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+



In [40]:
employeesDF. \
    withColumn('bonus1',coalesce(col('bonus').cast('int'),lit('0'))). \
show(truncate=False)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|employee_id|first_name|last_name|salary|bonus|nationality   |phone_number    |ssn        |bonus1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|1          |Scott     |Tiger    |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|10    |
|2          |Henry     |Ford     |1250.0|null |India         |+91 234 567 8901|456 78 9123|0     |
|3          |Nick      |Junior   |750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|0     |
|4          |Bill      |Gomes    |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|10    |
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+



In [41]:
from pyspark.sql.functions import expr

In [42]:
employeesDF. \
    withColumn('bonus1', expr("nvl(bonus,0)")). \
show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|bonus1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|    10|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|     0|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|      |
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|    10|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+



In [43]:
employeesDF. \
    withColumn('bonus1', expr("nvl(nullif(bonus,''),0)")). \
show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|bonus1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|    10|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|     0|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|     0|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|    10|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+



In [44]:
employeesDF. \
    withColumn('payment', col('salary')+(col('salary')*coalesce(col('bonus').cast('int'), lit(0))/100)). \
show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+-------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|payment|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+-------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789| 1100.0|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123| 1250.0|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|  750.0|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118| 1650.0|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+-------+



### 183 Using CASE and WHEN 

In [1]:
import getpass

username = getpass.getuser()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession. \
        builder. \
        config('spark.ui.port',0). \
        config('spark.sql.warehouse.dir', f'/user/{username}/warehouse'). \
        config('spark.sql.io.connectionTimeout','6000'). \
        enableHiveSupport(). \
        appName(f'{username} | Data Processing functions'). \
        master('yarn'). \
        getOrCreate()

In [3]:
employees = [(1, "Scott", "Tiger", 1000.0, 10,
                      "united states", "+1 123 456 7890", "123 45 6789"
                     ),
                     (2, "Henry", "Ford", 1250.0, None,
                      "India", "+91 234 567 8901", "456 78 9123"
                     ),
                     (3, "Nick", "Junior", 750.0, '',
                      "united KINGDOM", "+44 111 111 1111", "222 33 4444"
                     ),
                     (4, "Bill", "Gomes", 1500.0, 10,
                      "AUSTRALIA", "+61 987 654 3210", "789 12 6118"
                     )
                ]

In [4]:
employeesDF = spark. \
    createDataFrame(employees,
                    schema="""employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, bonus STRING, nationality STRING,
                    phone_number STRING, ssn STRING"""
                   )

In [5]:
employeesDF.show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [6]:
from pyspark.sql.functions import coalesce, col, lit

In [7]:
employeesDF. \
    withColumn('bonus1', coalesce(col('bonus').cast('int'),lit('0'))). \
show(truncate=False)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|employee_id|first_name|last_name|salary|bonus|nationality   |phone_number    |ssn        |bonus1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|1          |Scott     |Tiger    |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|10    |
|2          |Henry     |Ford     |1250.0|null |India         |+91 234 567 8901|456 78 9123|0     |
|3          |Nick      |Junior   |750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|0     |
|4          |Bill      |Gomes    |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|10    |
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+



In [8]:
employeesDF. \
    withColumn('bonus1', coalesce(col('bonus').cast('int'),lit('0'))). \
printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- bonus: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- ssn: string (nullable = true)
 |-- bonus1: string (nullable = false)



In [9]:
employeesDF. \
    withColumn('bonus1', coalesce(col('bonus').cast('int'),lit(0))). \
printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- bonus: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- ssn: string (nullable = true)
 |-- bonus1: integer (nullable = false)



In [10]:
employeesDF. \
    withColumn('bonus1', coalesce(col('bonus').cast('int'),lit(0))). \
show(truncate=False)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|employee_id|first_name|last_name|salary|bonus|nationality   |phone_number    |ssn        |bonus1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|1          |Scott     |Tiger    |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|10    |
|2          |Henry     |Ford     |1250.0|null |India         |+91 234 567 8901|456 78 9123|0     |
|3          |Nick      |Junior   |750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|0     |
|4          |Bill      |Gomes    |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|10    |
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+



In [11]:
from pyspark.sql.functions import expr

In [12]:
employeesDF. \
    withColumn('bonus1',expr("""
                    CASE WHEN bonus IS NULL OR bonus='' THEN 0
                    ELSE bonus
                    END
                    """)). \
show(truncate=False)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|employee_id|first_name|last_name|salary|bonus|nationality   |phone_number    |ssn        |bonus1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|1          |Scott     |Tiger    |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|10    |
|2          |Henry     |Ford     |1250.0|null |India         |+91 234 567 8901|456 78 9123|0     |
|3          |Nick      |Junior   |750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|0     |
|4          |Bill      |Gomes    |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|10    |
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+



In [13]:
from pyspark.sql.functions import when

In [14]:
employeesDF. \
    withColumn('bonus1', when((col('bonus').isNull()) | (col('bonus')==lit('')),0).otherwise(col('bonus'))). \
show(truncate=False)

+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|employee_id|first_name|last_name|salary|bonus|nationality   |phone_number    |ssn        |bonus1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|1          |Scott     |Tiger    |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|10    |
|2          |Henry     |Ford     |1250.0|null |India         |+91 234 567 8901|456 78 9123|0     |
|3          |Nick      |Junior   |750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|0     |
|4          |Bill      |Gomes    |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|10    |
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+



In [15]:
persons = [
    (1, 1),
    (2, 13),
    (3, 18),
    (4, 60),
    (5, 120),
    (6, 0),
    (7, 12),
    (8, 160)
]

In [16]:
personsDF = spark.createDataFrame(persons, schema='id INT, age INT')

In [17]:
personsDF.show()

+---+---+
| id|age|
+---+---+
|  1|  1|
|  2| 13|
|  3| 18|
|  4| 60|
|  5|120|
|  6|  0|
|  7| 12|
|  8|160|
+---+---+



In [18]:
personsDF. \
    withColumn(
        'category',
        expr("""
            CASE
            WHEN age BETWEEN 0 AND 2 THEN 'New Bon'
            WHEN age > 2 AND age <=12 THEN 'Infant'
            WHEN age > 12 AND age <= 48 THEN 'Toddler'
            WHEN age >48  AND age <= 144 THEN 'Kid'
            ELSE 'Teenager or Adult'
            END
        """)
    ). \
show()

+---+---+-----------------+
| id|age|         category|
+---+---+-----------------+
|  1|  1|          New Bon|
|  2| 13|          Toddler|
|  3| 18|          Toddler|
|  4| 60|              Kid|
|  5|120|              Kid|
|  6|  0|          New Bon|
|  7| 12|           Infant|
|  8|160|Teenager or Adult|
+---+---+-----------------+



In [19]:
personsDF. \
    withColumn('categry',
               when(col('age').between(0,2),'New Born').
               when((col('age')>2) & (col('age')<=12),'Infant').
               when((col('age')>12) & (col('age')<=48),'Toddler').
               when((col('age')>48) & (col('age')<=144), 'Kid').
               otherwise('Teenager or Adult')
              ). \
show(truncate=False)

+---+---+-----------------+
|id |age|categry          |
+---+---+-----------------+
|1  |1  |New Born         |
|2  |13 |Toddler          |
|3  |18 |Toddler          |
|4  |60 |Kid              |
|5  |120|Kid              |
|6  |0  |New Born         |
|7  |12 |Infant           |
|8  |160|Teenager or Adult|
+---+---+-----------------+

