In [2]:
from pyspark.sql import SparkSession

import getpass
username = getpass.getuser()

spark = SparkSession. \
    builder. \
    config('spark.ui.port', '0'). \
    config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
    enableHiveSupport(). \
    appName(f'{username} | Python - Data Processing - Overview'). \
    master('yarn'). \
    getOrCreate()

In [2]:
orders = spark. \
    read. \
    csv('/public/retail_db/orders',
        schema='''
            order_id INT, 
            order_date STRING, 
            order_customer_id INT, 
            order_status STRING
        '''
       )

In [3]:
orders.show(5, False)

+--------+---------------------+-----------------+---------------+
|order_id|order_date           |order_customer_id|order_status   |
+--------+---------------------+-----------------+---------------+
|1       |2013-07-25 00:00:00.0|11599            |CLOSED         |
|2       |2013-07-25 00:00:00.0|256              |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00.0|12111            |COMPLETE       |
|4       |2013-07-25 00:00:00.0|8827             |CLOSED         |
|5       |2013-07-25 00:00:00.0|11318            |COMPLETE       |
+--------+---------------------+-----------------+---------------+
only showing top 5 rows



In [4]:
orders.write.mode("overwrite").save("/user/itv736079/test/data/flights_delay.csv")

In [5]:
%%sh

hdfs dfs -ls /user/itv736079/test/data/flights_delay.csv

Found 2 items
-rw-r--r--   3 itv736079 supergroup          0 2021-10-20 09:44 /user/itv736079/test/data/flights_delay.csv/_SUCCESS
-rw-r--r--   3 itv736079 supergroup     487972 2021-10-20 09:44 /user/itv736079/test/data/flights_delay.csv/part-00000-982e1285-26f8-4ba5-8623-33045bc9847f-c000.snappy.parquet


#### You are given a DataFrame which looks like below.

`
+---+-----+------+----+----------+
| ID|FName| LName| DOB|Department|
+---+-----+------+----+----------+
|101| John| Doe|1977| Software|
|102|David|Turner|1984| Support|
|103|Abdul| Hamid|1978| Account|
+---+-----+------+----+----------+
`

#### You are given a task to transform this DataFrame to the following structure.

`
+---+---------------------+----------+
|ID |PersonalDetails |Department|
+---+---------------------+----------+
|101|[John, Doe, 1977] |Software |
|102|[David, Turner, 1984]|Support |
|103|[Abdul, Hamid, 1978] |Account |
+---+---------------------+----------+
`

In [6]:
data = [(101,'John','Doe',1977,'Software'),
(102,'David','Turner',1984,'Support'),
(103,'Abdul','Hamid',1978,'Account')]

In [7]:
df1 = spark.createDataFrame(data, ['ID', 'FName', 'LName', 'DOB', 'Department'])

In [8]:
df1.show()

+---+-----+------+----+----------+
| ID|FName| LName| DOB|Department|
+---+-----+------+----+----------+
|101| John|   Doe|1977|  Software|
|102|David|Turner|1984|   Support|
|103|Abdul| Hamid|1978|   Account|
+---+-----+------+----+----------+



In [4]:
data = [(101,'John','Doe',None,'Software'),
(102,'David','Turner',None,'Support'),
(103,'Abdul','Hamid',1978,'Account')]

df1 = spark.createDataFrame(data, ['ID', 'FName', 'LName', 'DOB', 'Department'])

df1.na.fill(None)

df1.show()

ValueError: value should be a float, int, long, string, bool or dict

In [9]:
import pyspark.sql.functions as F 

In [10]:
df2 = df1.select("id", F.struct("FName", "LName", "DOB").alias("PersonalDetails"), "Department")

df2.show(truncate=False)

+---+---------------------+----------+
|id |PersonalDetails      |Department|
+---+---------------------+----------+
|101|[John, Doe, 1977]    |Software  |
|102|[David, Turner, 1984]|Support   |
|103|[Abdul, Hamid, 1978] |Account   |
+---+---------------------+----------+



In [11]:
df2 = df1.selectExpr("id", "struct(FName, LName, DOB) as PersonalDetails", "Department")

df2.show(truncate=False)

df2.printSchema()

+---+---------------------+----------+
|id |PersonalDetails      |Department|
+---+---------------------+----------+
|101|[John, Doe, 1977]    |Software  |
|102|[David, Turner, 1984]|Support   |
|103|[Abdul, Hamid, 1978] |Account   |
+---+---------------------+----------+

root
 |-- id: long (nullable = true)
 |-- PersonalDetails: struct (nullable = false)
 |    |-- FName: string (nullable = true)
 |    |-- LName: string (nullable = true)
 |    |-- DOB: long (nullable = true)
 |-- Department: string (nullable = true)



In [12]:
df2.select(F.col("PersonalDetails").getField("FName")).show()

+---------------------+
|PersonalDetails.FName|
+---------------------+
|                 John|
|                David|
|                Abdul|
+---------------------+



In [13]:
df2 = df1.selectExpr("id", "Array(FName, LName, DOB) as PersonalDetails", "Department")

df2.show(truncate=False)

df2.printSchema()

+---+---------------------+----------+
|id |PersonalDetails      |Department|
+---+---------------------+----------+
|101|[John, Doe, 1977]    |Software  |
|102|[David, Turner, 1984]|Support   |
|103|[Abdul, Hamid, 1978] |Account   |
+---+---------------------+----------+

root
 |-- id: long (nullable = true)
 |-- PersonalDetails: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- Department: string (nullable = true)



In [14]:
data = [('X',)]

df = spark.createDataFrame(data, ['dummy'])

df.show()

+-----+
|dummy|
+-----+
|    X|
+-----+



In [15]:
df1 = df.select(F.current_date().alias("today"))

df1.show()

+----------+
|     today|
+----------+
|2021-10-20|
+----------+



In [16]:
df2 = df1.withColumn("week_ago", F.date_sub(F.col("today"), 7))

df2.show()

df2.printSchema()

+----------+----------+
|     today|  week_ago|
+----------+----------+
|2021-10-20|2021-10-13|
+----------+----------+

root
 |-- today: date (nullable = false)
 |-- week_ago: date (nullable = false)



In [17]:
df2 = df1.withColumn("week_ago", F.col("today")- 7)

df2.show()

df2.printSchema()

+----------+----------+
|     today|  week_ago|
+----------+----------+
|2021-10-20|2021-10-13|
+----------+----------+

root
 |-- today: date (nullable = false)
 |-- week_ago: date (nullable = false)



In [18]:
df2 = df1.withColumn("week_ago", F.expr("today - 7"))

df2.show()

df2.printSchema()

+----------+----------+
|     today|  week_ago|
+----------+----------+
|2021-10-20|2021-10-13|
+----------+----------+

root
 |-- today: date (nullable = false)
 |-- week_ago: date (nullable = false)



In [19]:
orders.select("order_id").distinct().agg(F.count("order_id"))

count(order_id)
68883


In [20]:
# orders.selectExpr("countDistinct(order_id)")

# this will fail as countDistinct() function is not available in Spark SQL.

In [21]:
orders.select(F.countDistinct("order_id"))

count(DISTINCT order_id)
68883


In [22]:
orders.selectExpr("count(distinct(order_id))")

count(DISTINCT order_id)
68883


In [23]:
orders.selectExpr("order_id", "order_customer_id", "if(order_customer_id < 5000, order_id * 50 , 0) as increment").show(5)

+--------+-----------------+---------+
|order_id|order_customer_id|increment|
+--------+-----------------+---------+
|       1|            11599|        0|
|       2|              256|      100|
|       3|            12111|        0|
|       4|             8827|        0|
|       5|            11318|        0|
+--------+-----------------+---------+
only showing top 5 rows



In [24]:
orders.select("*", F.expr("order_id * 5 as extend")).show(5)

+--------+--------------------+-----------------+---------------+------+
|order_id|          order_date|order_customer_id|   order_status|extend|
+--------+--------------------+-----------------+---------------+------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|     5|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|    10|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|    15|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|    20|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|    25|
+--------+--------------------+-----------------+---------------+------+
only showing top 5 rows



In [25]:
orders.selectExpr("*", "order_id * 5 as extend").show(5)

+--------+--------------------+-----------------+---------------+------+
|order_id|          order_date|order_customer_id|   order_status|extend|
+--------+--------------------+-----------------+---------------+------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|     5|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|    10|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|    15|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|    20|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|    25|
+--------+--------------------+-----------------+---------------+------+
only showing top 5 rows



In [26]:
data_list = [("Germany", 48, 10),
 ("Germany", 49, 5),
 ("Germany", 50, 3),
 ("Germany", 51, 2),
 ("United Kingdom", 48, 2),
 ("United Kingdom", 49, 2)]
 
df4 = spark.createDataFrame(data_list).toDF("Country", "Week", "Quantity")

df4.show()

+--------------+----+--------+
|       Country|Week|Quantity|
+--------------+----+--------+
|       Germany|  48|      10|
|       Germany|  49|       5|
|       Germany|  50|       3|
|       Germany|  51|       2|
|United Kingdom|  48|       2|
|United Kingdom|  49|       2|
+--------------+----+--------+



#### Calculate 3 weeks running total

In [27]:
from pyspark.sql import Window

running_total_window = Window. \
    partitionBy("Country"). \
    orderBy("Week"). \
    rowsBetween(2, Window.currentRow)
 
df4.withColumn("3WeekTotal", F.sum("Quantity").over(running_total_window)).show()

+--------------+----+--------+----------+
|       Country|Week|Quantity|3WeekTotal|
+--------------+----+--------+----------+
|       Germany|  48|      10|      null|
|       Germany|  49|       5|      null|
|       Germany|  50|       3|      null|
|       Germany|  51|       2|      null|
|United Kingdom|  48|       2|      null|
|United Kingdom|  49|       2|      null|
+--------------+----+--------+----------+



In [28]:
from pyspark.sql import Window

running_total_window = Window. \
    partitionBy("Country"). \
    orderBy("Week"). \
    rowsBetween(-2, Window.currentRow)
 
df4.withColumn("3WeekTotal", F.sum("Quantity").over(running_total_window)).show()

+--------------+----+--------+----------+
|       Country|Week|Quantity|3WeekTotal|
+--------------+----+--------+----------+
|       Germany|  48|      10|        10|
|       Germany|  49|       5|        15|
|       Germany|  50|       3|        18|
|       Germany|  51|       2|        10|
|United Kingdom|  48|       2|         2|
|United Kingdom|  49|       2|         4|
+--------------+----+--------+----------+



In [29]:
df4.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Week: long (nullable = true)
 |-- Quantity: long (nullable = true)



In [30]:
df4.withColumnRenamed('Week3', 'UpdatedWeek').show()

# no error will be thrown even if the column to be renamed does not exist

+--------------+----+--------+
|       Country|Week|Quantity|
+--------------+----+--------+
|       Germany|  48|      10|
|       Germany|  49|       5|
|       Germany|  50|       3|
|       Germany|  51|       2|
|United Kingdom|  48|       2|
|United Kingdom|  49|       2|
+--------------+----+--------+



In [31]:
df4.drop('Week3').show()

# no error will be thrown even if the column to be dropped does not exist

+--------------+----+--------+
|       Country|Week|Quantity|
+--------------+----+--------+
|       Germany|  48|      10|
|       Germany|  49|       5|
|       Germany|  50|       3|
|       Germany|  51|       2|
|United Kingdom|  48|       2|
|United Kingdom|  49|       2|
+--------------+----+--------+



In [32]:
from pyspark.sql.functions import *
df4.distinct()

Country,Week,Quantity
Germany,51,2
Germany,49,5
Germany,48,10
Germany,50,3
United Kingdom,48,2
United Kingdom,49,2


In [33]:
df4.registerTempTable("dfTable")

spark.sql("select * from dfTable").show()

+--------------+----+--------+
|       Country|Week|Quantity|
+--------------+----+--------+
|       Germany|  48|      10|
|       Germany|  49|       5|
|       Germany|  50|       3|
|       Germany|  51|       2|
|United Kingdom|  48|       2|
|United Kingdom|  49|       2|
+--------------+----+--------+



In [34]:
data = [(101,'WHITE HANGING HEART T-LIGHT HOLDER'),
(102,'WHITE LANTERN'),
(103,'RED WOOLLY HOTTIE WHITE HEART')]

df = spark.createDataFrame(data, ['ID','TEXT'])

df.show(truncate=False)

+---+----------------------------------+
|ID |TEXT                              |
+---+----------------------------------+
|101|WHITE HANGING HEART T-LIGHT HOLDER|
|102|WHITE LANTERN                     |
|103|RED WOOLLY HOTTIE WHITE HEART     |
+---+----------------------------------+



In [35]:
df1 = df.select("ID", split(col("TEXT"), " ").alias("VALUES"))

df1.show(truncate=False)

+---+----------------------------------------+
|ID |VALUES                                  |
+---+----------------------------------------+
|101|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|
|102|[WHITE, LANTERN]                        |
|103|[RED, WOOLLY, HOTTIE, WHITE, HEART]     |
+---+----------------------------------------+



In [36]:
df1.selectExpr("ID", "VALUES[0] as V1", "VALUES[1] as V2", "VALUES[2] as V3").show(truncate=False)

+---+-----+-------+------+
|ID |V1   |V2     |V3    |
+---+-----+-------+------+
|101|WHITE|HANGING|HEART |
|102|WHITE|LANTERN|null  |
|103|RED  |WOOLLY |HOTTIE|
+---+-----+-------+------+



In [37]:
df.toLocalIterator()

<generator object _local_iterator_from_socket.<locals>.PyLocalIterable.__iter__ at 0x7f4190c226d0>

In [38]:
list(df.toLocalIterator())

[Row(ID=101, TEXT='WHITE HANGING HEART T-LIGHT HOLDER'),
 Row(ID=102, TEXT='WHITE LANTERN'),
 Row(ID=103, TEXT='RED WOOLLY HOTTIE WHITE HEART')]

In [39]:
data = [(1, 3, 4,  25, 1, lit(None)),
(2, 6, 7,   2, 2, lit(None)),
(3, 3, lit(None),  25, 3, lit(None)),
(4,  lit(None), lit(None),   3, 2, lit(None)),
(5,  lit(None), lit(None),   lit(None), 2, lit(None)),
(6, 3, 2,  25, 2, lit(None))]

In [42]:
# transactionsDf = spark.createDataFrame(data, ['transactionId', 'predError', 'value', 'storeId', 'productId', 'f'])

# # transactionsDf.show()

In [43]:
data = [(1, 3, 4,  25, 1, 55),
(2, 6, 7,   2, 2, 55),
(3, 3, 55,  25, 3, 55),
(4,  55, 55,   3, 2, 55),
(5,  55, 55,   55, 2, 55),
(6, 3, 2,  25, 2, 55)]

transactionsDf = spark.createDataFrame(data, ['transactionId', 'predError', 'value', 'storeId', 'productId', 'f'])

transactionsDf.show()

+-------------+---------+-----+-------+---------+---+
|transactionId|predError|value|storeId|productId|  f|
+-------------+---------+-----+-------+---------+---+
|            1|        3|    4|     25|        1| 55|
|            2|        6|    7|      2|        2| 55|
|            3|        3|   55|     25|        3| 55|
|            4|       55|   55|      3|        2| 55|
|            5|       55|   55|     55|        2| 55|
|            6|        3|    2|     25|        2| 55|
+-------------+---------+-----+-------+---------+---+



In [45]:
transactionsDf.select("transactionId", "predError", "value", "f").show()

+-------------+---------+-----+---+
|transactionId|predError|value|  f|
+-------------+---------+-----+---+
|            1|        3|    4| 55|
|            2|        6|    7| 55|
|            3|        3|   55| 55|
|            4|       55|   55| 55|
|            5|       55|   55| 55|
|            6|        3|    2| 55|
+-------------+---------+-----+---+



In [46]:
transactionsDf.select(["transactionId", "predError", "value", "f"]).show()

+-------------+---------+-----+---+
|transactionId|predError|value|  f|
+-------------+---------+-----+---+
|            1|        3|    4| 55|
|            2|        6|    7| 55|
|            3|        3|   55| 55|
|            4|       55|   55| 55|
|            5|       55|   55| 55|
|            6|        3|    2| 55|
+-------------+---------+-----+---+

