In [1]:
from pyspark.sql import SparkSession

import getpass
# The OS account name namespaces both the warehouse directory and the app name.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled session that runs on YARN.
# NOTE(review): 'spark.ui.port' = '0' presumably lets Spark pick any free UI port — confirm.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Data Processing - Overview')
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Sample employees; two of the four rows have no salary on purpose.
employee_rows = [
    (11, "Deepak", 100),
    (12, "Leena", None),
    (13, "Deepa", 102),
    (15, "Preety", None),
]
# Column names intentionally contain spaces; later cells normalise them.
employee_columns = ["emp id", "emp name", "emp salary"]

nullDF = spark.createDataFrame(employee_rows, employee_columns)

nullDF.show()

+------+--------+----------+
|emp id|emp name|emp salary|
+------+--------+----------+
|    11|  Deepak|       100|
|    12|   Leena|      null|
|    13|   Deepa|       102|
|    15|  Preety|      null|
+------+--------+----------+



In [3]:
# Capture the DataFrame's column names (they still contain spaces at this point).
cols = nullDF.columns
# Bare name as the last expression so the notebook echoes the list.
cols

['emp id', 'emp name', 'emp salary']

In [4]:
# Preview: capitalise the first letter of every word in each column name.
[name.title() for name in cols]

['Emp Id', 'Emp Name', 'Emp Salary']

In [5]:
# Title-case each name, then drop all whitespace: split() with no arguments
# breaks on any whitespace run and join() glues the words back together.
titleCaseCols = ["".join(name.title().split()) for name in cols]

In [6]:
# Echo the whitespace-free, title-cased column names.
titleCaseCols

['EmpId', 'EmpName', 'EmpSalary']

In [7]:
# Turn TitleCase into camelCase by lower-casing only the first character.
# Slicing with [:1] instead of indexing [0] means an empty name (for example a
# column whose original name was all whitespace) yields "" rather than
# raising IndexError.
# NOTE: the spelling "camle" is kept because later cells reference this name.
camleCaseCols = [column[:1].lower() + column[1:] for column in titleCaseCols]

In [8]:
# Echo the camelCase column names.
camleCaseCols

['empId', 'empName', 'empSalary']

In [9]:
# Rename every column, by position, to its camelCase form. toDF() takes the
# new names as separate arguments, hence the unpacking.
nullDF = nullDF.toDF(*camleCaseCols)

nullDF.show()

+-----+-------+---------+
|empId|empName|empSalary|
+-----+-------+---------+
|   11| Deepak|      100|
|   12|  Leena|     null|
|   13|  Deepa|      102|
|   15| Preety|     null|
+-----+-------+---------+



https://www.linkedin.com/company/justenough-spark/posts/?feedView=images

In [10]:
from pyspark.sql.functions import *

# Show the full frame first so the filtered result below can be compared to it.
nullDF.show()

# isnull() keeps only the rows whose salary is missing, so the result is named
# for what it actually holds: employees with a null salary.
nullSalaryDF = nullDF.filter(isnull("empSalary"))

nullSalaryDF.show()

+-----+-------+---------+
|empId|empName|empSalary|
+-----+-------+---------+
|   11| Deepak|      100|
|   12|  Leena|     null|
|   13|  Deepa|      102|
|   15| Preety|     null|
+-----+-------+---------+

+-----+-------+---------+
|empId|empName|empSalary|
+-----+-------+---------+
|   12|  Leena|     null|
|   15| Preety|     null|
+-----+-------+---------+



In [11]:
# Same selection as the isnull() function form, written with the Column method;
# where() is an alias of filter().
# NOTE(review): despite its name, this frame holds the rows whose salary IS null.
salary_is_missing = col("empSalary").isNull()
nonNullDF = nullDF.where(salary_is_missing)

nonNullDF.show()

+-----+-------+---------+
|empId|empName|empSalary|
+-----+-------+---------+
|   12|  Leena|     null|
|   15| Preety|     null|
+-----+-------+---------+



In [12]:
# Single-column frame named `id` holding 0 through 9 (the end bound is exclusive).
df = spark.range(0, 10)

df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



In [13]:
# No schema or column names are supplied, so the columns come out as _1, _2.
id_name_pairs = [(1, 'Subhayan'), (2, 'Ankana')]
df = spark.createDataFrame(id_name_pairs)

df.show()

+---+--------+
| _1|      _2|
+---+--------+
|  1|Subhayan|
|  2|  Ankana|
+---+--------+



In [14]:
# Look up the session's current value for spark.sql.files.maxPartitionBytes.
spark.conf.get("spark.sql.files.maxPartitionBytes")

'134217728b'

In [15]:
# spark.conf.get("spark.driver.maxResultSize")

In [16]:
# display() renders the frame as a side effect; its return value is captured
# only so it can be inspected below.
d = display(df)

# The cell output reports NoneType, i.e. display() returned nothing.
type(d)

_1,_2
1,Subhayan
2,Ankana


NoneType

In [17]:
# flatten() collapses an array of arrays into a single array.
flatten_query = "SELECT flatten(array(array(1, 2), array(3, 4)))"
spark.sql(flatten_query).show()

+----------------------------------------+
|flatten(array(array(1, 2), array(3, 4)))|
+----------------------------------------+
|                            [1, 2, 3, 4]|
+----------------------------------------+



In [18]:
# For contrast with flatten(): the same nested array left as-is.
nested_array_query = "SELECT array(array(1, 2), array(3, 4))"
spark.sql(nested_array_query).show()

+-------------------------------+
|array(array(1, 2), array(3, 4))|
+-------------------------------+
|               [[1, 2], [3, 4]]|
+-------------------------------+



In [19]:
# floor() rounds down to the nearest integer.
floor_query = "SELECT floor(5.9) AS floored"
spark.sql(floor_query).show()

+-------+
|floored|
+-------+
|      5|
+-------+



In [20]:
# ceil() rounds up to the nearest integer.
ceil_query = "SELECT ceil(5.9) AS ceiled"
spark.sql(ceil_query).show()

+------+
|ceiled|
+------+
|     6|
+------+



In [21]:
# The five trim() variants side by side: default (spaces), two-argument form,
# and the BOTH / LEADING / TRAILING ... FROM forms.
# Adjacent string literals concatenate into one single-line SQL statement.
trim_query = (
    "SELECT trim('    SparkSQL   '), "
    "       trim('SL', 'SSparkSQLS'), "
    "       trim(BOTH 'SL' FROM 'SSparkSQLS'), "
    "       trim(LEADING 'SL' FROM 'SSparkSQLS'), "
    "       trim(TRAILING 'SL' FROM 'SSparkSQLS')"
)
spark.sql(trim_query).show()

+---------------------+-----------------------------+-----------------------------+--------------------------------+---------------------------------+
|trim(    SparkSQL   )|TRIM(BOTH SL FROM SSparkSQLS)|TRIM(BOTH SL FROM SSparkSQLS)|TRIM(LEADING SL FROM SSparkSQLS)|TRIM(TRAILING SL FROM SSparkSQLS)|
+---------------------+-----------------------------+-----------------------------+--------------------------------+---------------------------------+
|             SparkSQL|                       parkSQ|                       parkSQ|                        parkSQLS|                         SSparkSQ|
+---------------------+-----------------------------+-----------------------------+--------------------------------+---------------------------------+



In [22]:
from pyspark.sql.types import *
from pyspark.sql import Window

# Explicit two-column schema: a nullable string and a nullable integer.
letter_field = StructField("letter", StringType(), True)
position_field = StructField("position", IntegerType(), True)
schema = StructType([letter_field, position_field])

# Two rows tie on position 10 so the ranking functions can be told apart.
letter_rows = [("a", 10), ("a", 10), ("a", 20)]
df = spark.createDataFrame(letter_rows, schema)

# One window per letter, ordered by position within it.
windowSpec = Window.partitionBy("letter").orderBy("position")

df.show()

+------+--------+
|letter|position|
+------+--------+
|     a|      10|
|     a|      10|
|     a|      20|
+------+--------+



In [23]:
# Compare the three ranking functions over the same window: with a tie,
# rank() leaves a gap, dense_rank() does not, and row_number() stays unique.
(
    df
    .withColumn("rank", rank().over(windowSpec))
    .withColumn("dense_rank", dense_rank().over(windowSpec))
    .withColumn("row_number", row_number().over(windowSpec))
    .show()
)

+------+--------+----+----------+----------+
|letter|position|rank|dense_rank|row_number|
+------+--------+----+----------+----------+
|     a|      10|   1|         1|         1|
|     a|      10|   1|         1|         2|
|     a|      20|   3|         2|         3|
+------+--------+----+----------+----------+



In [24]:
# Every field, including age and salary, is deliberately supplied as a string.
data_list = [
    ("Ravi", "28", "3200"),
    ("Abdul", "23", "4800"),
    ("John", "32", "6500"),
    ("Rosy", "48", "8200"),
]

# Create the frame with default column names, then rename them positionally.
unnamed_df = spark.createDataFrame(data_list)
df = unnamed_df.toDF("name", "age", "salary")

df.show()

+-----+---+------+
| name|age|salary|
+-----+---+------+
| Ravi| 28|  3200|
|Abdul| 23|  4800|
| John| 32|  6500|
| Rosy| 48|  8200|
+-----+---+------+



In [25]:
# `salary` holds strings, yet the multiplication works and the displayed
# products are floating-point values.
scaled_salary = col("salary") * 20
df.select("name", scaled_salary).show()

+-----+-------------+
| name|(salary * 20)|
+-----+-------------+
| Ravi|      64000.0|
|Abdul|      96000.0|
| John|     130000.0|
| Rosy|     164000.0|
+-----+-------------+



In [26]:
# Same computation as the col() version, with the column referenced through expr().
scaled_salary_expr = expr("salary") * 20
df.select("name", scaled_salary_expr).show()

+-----+-------------+
| name|(salary * 20)|
+-----+-------------+
| Ravi|      64000.0|
|Abdul|      96000.0|
| John|     130000.0|
| Rosy|     164000.0|
+-----+-------------+



In [27]:
# One-row, one-column frame; the following cells select expressions from it.
data = [('X',)]

df = spark.createDataFrame(data, ['dummy'])

df.show()

+-----+
|dummy|
+-----+
|    X|
+-----+



In [29]:
# Project just the current date, exposed under the column name `today`.
today_column = current_date().alias("today")
df1 = df.select(today_column)

df1.show()

+----------+
|     today|
+----------+
|2021-10-22|
+----------+



In [31]:
# Print df1's schema to check the type of the `today` column.
df1.printSchema()

root
 |-- today: date (nullable = false)



In [30]:
# Add a column that is seven days after `today`.
one_week_later = date_add(col("today"), 7)
df2 = df1.withColumn("week_later", one_week_later)

df2.show()

+----------+----------+
|     today|week_later|
+----------+----------+
|2021-10-22|2021-10-29|
+----------+----------+



In [32]:
# Same week-later computation, but with `today` passed through to_date() first.
# NOTE(review): `today` is already a date column and prints as yyyy-MM-dd,
# which does not match the "dd-MM-yyyy" pattern given here. The output is
# still identical to the plain date_add("today", 7) version, so the pattern
# appears to have no effect on a date input — confirm, and consider dropping
# the to_date() call.
df2 = df1.withColumn("week_later", date_add(to_date("today", "dd-MM-yyyy"), 7))

df2.show()

+----------+----------+
|     today|week_later|
+----------+----------+
|2021-10-22|2021-10-29|
+----------+----------+



----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 40604)
Traceback (most recent call last):
  File "/opt/anaconda3/envs/beakerx/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/anaconda3/envs/beakerx/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/opt/anaconda3/envs/beakerx/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/anaconda3/envs/beakerx/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/pyspark/accumulato