In [1]:
from pyspark.sql import SparkSession

import getpass
username = getpass.getuser()

spark = SparkSession. \
    builder. \
    config('spark.ui.port', '0'). \
    config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
    enableHiveSupport(). \
    appName(f'{username} | Python - Data Processing - Overview'). \
    master('yarn'). \
    getOrCreate()

In [2]:
data1 = [('ABC', 'DEF', 'GHI'),
('PQR', 'STU', 'VWZ'),
('SMT', 'YUH', 'SGR'),
('SWI', 'FYG', 'LKU')]

In [3]:
data2 = [('HI', 'HELLO', 'HOW'),
('ARE', 'YOU', 'FINE'),
('ETC', 'NO', 'WORRY'),
('SAY', 'YOU', 'ARE')]

In [4]:
df1 = spark.createDataFrame(data1, ["A", "B", "C"])
df1.show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|ABC|DEF|GHI|
|PQR|STU|VWZ|
|SMT|YUH|SGR|
|SWI|FYG|LKU|
+---+---+---+



In [5]:
df2 = spark.createDataFrame(data2, ["A1", "B1", "C1"])
df2.show()

+---+-----+-----+
| A1|   B1|   C1|
+---+-----+-----+
| HI|HELLO|  HOW|
|ARE|  YOU| FINE|
|ETC|   NO|WORRY|
|SAY|  YOU|  ARE|
+---+-----+-----+



In [6]:
df3 = df1.union(df2)

df3.show()

+---+-----+-----+
|  A|    B|    C|
+---+-----+-----+
|ABC|  DEF|  GHI|
|PQR|  STU|  VWZ|
|SMT|  YUH|  SGR|
|SWI|  FYG|  LKU|
| HI|HELLO|  HOW|
|ARE|  YOU| FINE|
|ETC|   NO|WORRY|
|SAY|  YOU|  ARE|
+---+-----+-----+



***************************************
Input:
```
+---+----+
| id|name|
+---+----+
|  1|   a|
|  1|   b|
|  1|   c|
|  1|   d|
|  2|   e|
|  2|   f|
|  2|   g|
+---+----+
```

Output:
```
+---+-----+
| id|names|
+---+-----+
|  1| abcd|
|  2|  efg|
+---+-----+
```

In [7]:
data = [(1,'a'),
(1,'b'),
(1,'c'),
(1,'d'),
(2,'e'),
(2,'f'),
(2,'g')]

In [8]:
df = spark.createDataFrame(data, ["id", "name"])

df.show()

+---+----+
| id|name|
+---+----+
|  1|   a|
|  1|   b|
|  1|   c|
|  1|   d|
|  2|   e|
|  2|   f|
|  2|   g|
+---+----+



In [9]:
from pyspark.sql.functions import *

from pyspark.sql import Window

In [10]:
df2 = df.groupBy("id"). \
    agg(collect_list("name").alias("names"))

In [11]:
df2.show()

+---+------------+
| id|       names|
+---+------------+
|  1|[a, b, c, d]|
|  2|   [e, f, g]|
+---+------------+



In [12]:
df3 = df.groupBy("id"). \
    agg(concat_ws("", collect_list("name")).alias("names"))

In [13]:
df3.show()

+---+-----+
| id|names|
+---+-----+
|  1| dabc|
|  2|  efg|
+---+-----+



***************************************
Input:
```
+-----+-----+-------+
|empno|clmno| status|
+-----+-----+-------+
|  101| clm1|   null|
|  101| clm2|pending|
|  102| clm3| delete|
|  102| clm4|pending|
|  103| clm5|pending|
+-----+-----+-------+
```
Whenever I have null for a empid I need to get only that null record,
Else i need to get all records
```
+-----+-----+-------+
|empno|clmno| status|
+-----+-----+-------+
|  103| clm5|pending|
|  101| clm1|   null|
|  102| clm3| delete|
|  102| clm4|pending|
+-----+-----+-------+
```

In [14]:
df =  spark.createDataFrame([(101,"clm1",None),(101,"clm2","pending"),(102,"clm3","delete"),(102,"clm4","pending"),(103,"clm5","pending")],["empno","clmno","status"])

In [15]:
df.show()

+-----+-----+-------+
|empno|clmno| status|
+-----+-----+-------+
|  101| clm1|   null|
|  101| clm2|pending|
|  102| clm3| delete|
|  102| clm4|pending|
|  103| clm5|pending|
+-----+-----+-------+



In [16]:
df = df.withColumn("status", when(col("status").isNull(), "NA").otherwise(col("status")))

df.show()

+-----+-----+-------+
|empno|clmno| status|
+-----+-----+-------+
|  101| clm1|     NA|
|  101| clm2|pending|
|  102| clm3| delete|
|  102| clm4|pending|
|  103| clm5|pending|
+-----+-----+-------+



In [17]:
_w = Window.partitionBy("empno").orderBy("empno")
df = df.withColumn("array_col", collect_list(col("status")).over(_w))

df.show()

+-----+-----+-------+-----------------+
|empno|clmno| status|        array_col|
+-----+-----+-------+-----------------+
|  103| clm5|pending|        [pending]|
|  101| clm1|     NA|    [NA, pending]|
|  101| clm2|pending|    [NA, pending]|
|  102| clm3| delete|[delete, pending]|
|  102| clm4|pending|[delete, pending]|
+-----+-----+-------+-----------------+



In [18]:
df = df.withColumn("filter_col", array_contains(col("array_col"),"NA"))

df.show()

+-----+-----+-------+-----------------+----------+
|empno|clmno| status|        array_col|filter_col|
+-----+-----+-------+-----------------+----------+
|  103| clm5|pending|        [pending]|     false|
|  101| clm1|     NA|    [NA, pending]|      true|
|  101| clm2|pending|    [NA, pending]|      true|
|  102| clm3| delete|[delete, pending]|     false|
|  102| clm4|pending|[delete, pending]|     false|
+-----+-----+-------+-----------------+----------+



In [19]:
df2 = df.filter((col("filter_col") == False) | (col("filter_col") == True) & (col("status") == "NA")). \
    withColumn("status", when(col("status") == "NA", None).otherwise(col("status"))). \
    drop("array_col", "filter_col")

In [20]:
df2.show()

+-----+-----+-------+
|empno|clmno| status|
+-----+-----+-------+
|  103| clm5|pending|
|  101| clm1|   null|
|  102| clm3| delete|
|  102| clm4|pending|
+-----+-----+-------+

