In [1]:
import os
import sys
from pyspark.sql import SparkSession

# Location of the Hadoop/winutils distribution used by Spark on Windows.
# An already-exported HADOOP_HOME is respected; otherwise fall back to the
# local install path this notebook was written against.
HADOOP_HOME = os.environ.get(
    "HADOOP_HOME",
    "C:\\Users\\SkJain\\Downloads\\Compressed\\winutils-master\\hadoop-3.2.2",
)
HADOOP_BIN = os.path.join(HADOOP_HOME, "bin")

os.environ["HADOOP_HOME"] = HADOOP_HOME

# The native Hadoop binaries are looked up through the process PATH (which the
# JVM started by Spark inherits), not through Python's module search path, so
# the bin directory must be put on PATH before the session is created.
path_entries = [entry for entry in os.environ.get("PATH", "").split(os.pathsep) if entry]
if HADOOP_BIN not in path_entries:
    os.environ["PATH"] = os.pathsep.join([HADOOP_BIN] + path_entries)

# Kept so the cell still leaves the same sys.path entry as before.
if HADOOP_BIN not in sys.path:
    sys.path.append(HADOOP_BIN)

In [2]:
# Build (or reuse) the Hive-enabled local session that every query below runs on.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .enableHiveSupport()
    .appName('SparkSql')
    .master('local')
    .getOrCreate()
)

## List functions and see how to use them

In [6]:
# List the available functions: up to 300 rows, without truncating names.
spark.sql("SHOW FUNCTIONS").show(n=300, truncate=False)

In [5]:
# Show the description of a single function.
spark.sql(
    "DESCRIBE FUNCTION substr"
).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|function_desc                                                                                                                                                                                                                                                                                                                                                                          |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Validating functions

- SparkSql follows mysql style. 
- We can simply write queries like SELECT current_date; or SELECT SUBSTR('Hello World',1,5)
- if we want to do it oracle style we can create a dummy table named 'dual' with one column named dummy and insert one value into it 'X'
- then we can write our queries like: "SELECT SUBSTR('Hello World',1,5) result FROM dual"

In [7]:
# An expression can be evaluated without a FROM clause.
spark.sql(
    "SELECT current_date"
).show()

+--------------+
|current_date()|
+--------------+
|    2022-04-14|
+--------------+



In [9]:
# First five characters of a literal, aliased as `result`.
spark.sql(
    "SELECT SUBSTR('Hello World',1,5) result"
).show()

+------+
|result|
+------+
| Hello|
+------+



## String Manipulation functions
- case conversion: lower, upper, initcap
- size of column value: length
- extracting data: substr, split (substr and substring both do the same thing)
- trimming: trim, rtrim, ltrim
    - removes extra white spaces from both sides, right side and left side respectively
- padding: rpad, lpad
    - select lpad(column, desired_length, character_to_pad on left)
    - if the column is shorter than the desired length, character_to_pad is added until the desired length is reached
    - if the length of the column equals the desired length, nothing happens
    - if the column is longer than the desired length, lpad truncates it to the desired length (characters are dropped from the right)
- reverse string: reverse
- concatenate multiple strings: concat and concat_ws 
    - ws stands for 'with separator'
    - used when we want to concatenate multiple string but with same separator in between (eg. space for name or hyphen for date)
    - first value in concat_ws is the separator followed by actual strings
- explode (to convert one record into multiple records) : eg SELECT explode(split('2013-07-25', '-'))
    - the above query will create 3 rows with values 2013, 07, 25

## Date Manipulation functions

### Getting current date or current timestamp
- both these functions are not listed in SHOW FUNCTIONS, but DESCRIBE still works
- dates are implicitly cast to strings when needed, so string functions can also be used on them

In [13]:
# default format: yyyy-MM-dd
spark.sql(
    "SELECT current_date"
).show()

+--------------+
|current_date()|
+--------------+
|    2022-04-14|
+--------------+



In [14]:
# default format: yyyy-MM-dd HH:mm:ss.SSS
spark.sql(
    "SELECT current_timestamp"
).show(truncate=False)

+-----------------------+
|current_timestamp()    |
+-----------------------+
|2022-04-14 20:43:32.863|
+-----------------------+



### Date Arithmetic
- date_add
- date_sub
- datediff
- add_months

In [15]:
# Ten days after today.
spark.sql(
    "SELECT date_add(current_date, 10)"
).show(truncate=False)

+----------------------------+
|date_add(current_date(), 10)|
+----------------------------+
|2022-04-24                  |
+----------------------------+



In [16]:
# 365 days after today.
spark.sql(
    "SELECT date_add(current_date, 365)"
).show(truncate=False)

+-----------------------------+
|date_add(current_date(), 365)|
+-----------------------------+
|2023-04-14                   |
+-----------------------------+



In [17]:
# A negative day count moves the date backwards.
spark.sql(
    "SELECT date_add(current_date, -365)"
).show(truncate=False)

+------------------------------+
|date_add(current_date(), -365)|
+------------------------------+
|2021-04-14                    |
+------------------------------+



In [18]:
# date_sub with 365 gives the same date as date_add with -365.
spark.sql(
    "SELECT date_sub(current_date, 365)"
).show(truncate=False)

+-----------------------------+
|date_sub(current_date(), 365)|
+-----------------------------+
|2021-04-14                   |
+-----------------------------+



In [20]:
# Number of days from the second date to the first; the strings are cast to DATE.
spark.sql(
    "SELECT datediff('2022-04-14', '2022-01-01')"
).show(truncate=False)

+------------------------------------------------------------+
|datediff(CAST(2022-04-14 AS DATE), CAST(2022-01-01 AS DATE))|
+------------------------------------------------------------+
|103                                                         |
+------------------------------------------------------------+



In [21]:
# Nine months after the given date.
spark.sql(
    "SELECT add_months('2022-04-20', 9)"
).show(truncate=False)

+---------------------------------------+
|add_months(CAST(2022-04-20 AS DATE), 9)|
+---------------------------------------+
|2023-01-20                             |
+---------------------------------------+



In [23]:
# add_months on a month-end date: when the target month is shorter than the
# starting day, the result is clamped to the target month's last day
# (here 2022-01-31 + 1 month -> 2022-02-28)
spark.sql("SELECT add_months('2022-01-31', 1)").show(truncate=False)

+---------------------------------------+
|add_months(CAST(2022-01-31 AS DATE), 1)|
+---------------------------------------+
|2022-02-28                             |
+---------------------------------------+



### Beginning date or time
- trunc()
    - works on both dates and timestamps
    - used here to get the beginning of the month or of the year
    - use MM (also MON / MONTH) for month and YY (also YYYY / YEAR) for year
    - a format that trunc() does not recognise returns null
- date_trunc()
    - get the beginning of a time unit (year, month, day, hour, ...), down to seconds
    - takes format first unlike trunc()

In [25]:
# First day of the current month.
spark.sql(
    "SELECT trunc(current_date, 'MM')"
).show(truncate=False)

+-------------------------+
|trunc(current_date(), MM)|
+-------------------------+
|2022-04-01               |
+-------------------------+



In [26]:
# First day of the current year.
spark.sql(
    "SELECT trunc(current_date, 'YY')"
).show(truncate=False)

+-------------------------+
|trunc(current_date(), YY)|
+-------------------------+
|2022-01-01               |
+-------------------------+



In [33]:
# Usage text for date_trunc (format comes first, timestamp second).
spark.sql("DESCRIBE FUNCTION date_trunc").show(n=200, truncate=False)

+----------------------------------------------------------------------------------------------------------------------+
|function_desc                                                                                                         |
+----------------------------------------------------------------------------------------------------------------------+
|Function: date_trunc                                                                                                  |
|Class: org.apache.spark.sql.catalyst.expressions.TruncTimestamp                                                       |
|Usage: 
    date_trunc(fmt, ts) - Returns timestamp `ts` truncated to the unit specified by the format model `fmt`.
  |
+----------------------------------------------------------------------------------------------------------------------+



In [37]:
# Start of the current hour.
spark.sql(
    "SELECT date_trunc('HOUR',current_timestamp)"
).show(truncate=False)

+-------------------------------------+
|date_trunc(HOUR, current_timestamp())|
+-------------------------------------+
|2022-04-15 11:00:00                  |
+-------------------------------------+



In [38]:
# Start of the current day (midnight).
spark.sql(
    "SELECT date_trunc('DAY',current_timestamp)"
).show(truncate=False)

+------------------------------------+
|date_trunc(DAY, current_timestamp())|
+------------------------------------+
|2022-04-15 00:00:00                 |
+------------------------------------+



### Extracting info using date_format

In [39]:
# Usage text for date_format.
spark.sql("DESCRIBE FUNCTION date_format").show(n=200, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------+
|function_desc                                                                                                                   |
+--------------------------------------------------------------------------------------------------------------------------------+
|Function: date_format                                                                                                           |
|Class: org.apache.spark.sql.catalyst.expressions.DateFormatClass                                                                |
|Usage: date_format(timestamp, fmt) - Converts `timestamp` to a value of string in the format specified by the date format `fmt`.|
+--------------------------------------------------------------------------------------------------------------------------------+



In [40]:
# 'yyyy' -> four-digit year.
spark.sql(
    "SELECT current_timestamp, date_format(current_timestamp, 'yyyy')"
).show(truncate=False)

+-----------------------+---------------------------------------------------------------+
|current_timestamp()    |date_format(current_timestamp() AS `current_timestamp()`, yyyy)|
+-----------------------+---------------------------------------------------------------+
|2022-04-15 11:24:08.091|2022                                                           |
+-----------------------+---------------------------------------------------------------+



In [41]:
# 'yy' -> two-digit year.
spark.sql(
    "SELECT current_timestamp, date_format(current_timestamp, 'yy')"
).show(truncate=False)

+-----------------------+-------------------------------------------------------------+
|current_timestamp()    |date_format(current_timestamp() AS `current_timestamp()`, yy)|
+-----------------------+-------------------------------------------------------------+
|2022-04-15 11:24:23.171|22                                                           |
+-----------------------+-------------------------------------------------------------+



In [43]:
# 'MM' -> two-digit month number.
spark.sql(
    "SELECT current_timestamp, date_format(current_timestamp, 'MM')"
).show(truncate=False)

+-----------------------+-------------------------------------------------------------+
|current_timestamp()    |date_format(current_timestamp() AS `current_timestamp()`, MM)|
+-----------------------+-------------------------------------------------------------+
|2022-04-15 11:24:40.108|04                                                           |
+-----------------------+-------------------------------------------------------------+



In [44]:
# 'dd' -> two-digit day of month.
spark.sql(
    "SELECT current_timestamp, date_format(current_timestamp, 'dd')"
).show(truncate=False)

+-----------------------+-------------------------------------------------------------+
|current_timestamp()    |date_format(current_timestamp() AS `current_timestamp()`, dd)|
+-----------------------+-------------------------------------------------------------+
|2022-04-15 11:24:49.087|15                                                           |
+-----------------------+-------------------------------------------------------------+



In [49]:
# 'MMM' -> abbreviated month name.
spark.sql(
    "SELECT current_timestamp, date_format(current_timestamp, 'MMM')"
).show(truncate=False)

+-----------------------+--------------------------------------------------------------+
|current_timestamp()    |date_format(current_timestamp() AS `current_timestamp()`, MMM)|
+-----------------------+--------------------------------------------------------------+
|2022-04-15 11:27:52.166|Apr                                                           |
+-----------------------+--------------------------------------------------------------+



In [50]:
# 'MMMM' -> full month name.
spark.sql(
    "SELECT current_timestamp, date_format(current_timestamp, 'MMMM')"
).show(truncate=False)

+-----------------------+---------------------------------------------------------------+
|current_timestamp()    |date_format(current_timestamp() AS `current_timestamp()`, MMMM)|
+-----------------------+---------------------------------------------------------------+
|2022-04-15 11:28:05.646|April                                                          |
+-----------------------+---------------------------------------------------------------+



In [51]:
# 'EE' -> abbreviated day-of-week name.
spark.sql(
    "SELECT current_timestamp, date_format(current_timestamp, 'EE')"
).show(truncate=False)

+-----------------------+-------------------------------------------------------------+
|current_timestamp()    |date_format(current_timestamp() AS `current_timestamp()`, EE)|
+-----------------------+-------------------------------------------------------------+
|2022-04-15 11:28:14.311|Fri                                                          |
+-----------------------+-------------------------------------------------------------+



In [52]:
# 'EEEE' -> full day-of-week name.
spark.sql(
    "SELECT current_timestamp, date_format(current_timestamp, 'EEEE')"
).show(truncate=False)

+-----------------------+---------------------------------------------------------------+
|current_timestamp()    |date_format(current_timestamp() AS `current_timestamp()`, EEEE)|
+-----------------------+---------------------------------------------------------------+
|2022-04-15 11:28:29.638|Friday                                                         |
+-----------------------+---------------------------------------------------------------+



In [55]:
# 'HH' -> hour, aliased by the query as 24HrFormat.
spark.sql(
    "SELECT current_timestamp, date_format(current_timestamp, 'HH') as 24HrFormat"
).show(truncate=False)

+-----------------------+----------+
|current_timestamp()    |24HrFormat|
+-----------------------+----------+
|2022-04-15 11:29:16.483|11        |
+-----------------------+----------+



In [56]:
# 'hh' -> hour, aliased by the query as 12HrFormat.
spark.sql(
    "SELECT current_timestamp, date_format(current_timestamp, 'hh') as 12HrFormat"
).show(truncate=False)

+-----------------------+----------+
|current_timestamp()    |12HrFormat|
+-----------------------+----------+
|2022-04-15 11:29:16.785|11        |
+-----------------------+----------+



In [57]:
# Lower-case 'mm' -> minutes (upper-case 'MM' is the month).
spark.sql(
    "SELECT current_timestamp, date_format(current_timestamp, 'mm')"
).show(truncate=False)

+-----------------------+-------------------------------------------------------------+
|current_timestamp()    |date_format(current_timestamp() AS `current_timestamp()`, mm)|
+-----------------------+-------------------------------------------------------------+
|2022-04-15 11:29:30.299|29                                                           |
+-----------------------+-------------------------------------------------------------+



In [58]:
# Lower-case 'ss' -> seconds.
spark.sql(
    "SELECT current_timestamp, date_format(current_timestamp, 'ss')"
).show(truncate=False)

+-----------------------+-------------------------------------------------------------+
|current_timestamp()    |date_format(current_timestamp() AS `current_timestamp()`, ss)|
+-----------------------+-------------------------------------------------------------+
|2022-04-15 11:29:37.915|37                                                           |
+-----------------------+-------------------------------------------------------------+



In [59]:
# Upper-case 'SS' -> leading digits of the fractional second (not the seconds).
spark.sql(
    "SELECT current_timestamp, date_format(current_timestamp, 'SS')"
).show(truncate=False)

+-----------------------+-------------------------------------------------------------+
|current_timestamp()    |date_format(current_timestamp() AS `current_timestamp()`, SS)|
+-----------------------+-------------------------------------------------------------+
|2022-04-15 11:29:51.431|43                                                           |
+-----------------------+-------------------------------------------------------------+



In [61]:
# Convert to a different style of writing the date: day name, then dd/MM/yy.
custom_format_query = (
    "SELECT current_timestamp, \n"
    "          date_format(current_timestamp ,'EEEE dd/MM/yy')"
)
spark.sql(custom_format_query).show(truncate=False)

+-----------------------+------------------------------------------------------------------------+
|current_timestamp()    |date_format(current_timestamp() AS `current_timestamp()`, EEEE dd/MM/yy)|
+-----------------------+------------------------------------------------------------------------+
|2022-04-15 11:33:03.374|Friday 15/04/22                                                         |
+-----------------------+------------------------------------------------------------------------+



### Calendar functions
- get values such as day, dayOfMonth, month, weekOfYear, year etc.

In [63]:
# Usage text for day().
spark.sql(
    "DESCRIBE FUNCTION day"
).show(truncate=False)

+------------------------------------------------------------------+
|function_desc                                                     |
+------------------------------------------------------------------+
|Function: day                                                     |
|Class: org.apache.spark.sql.catalyst.expressions.DayOfMonth       |
|Usage: day(date) - Returns the day of month of the date/timestamp.|
+------------------------------------------------------------------+



In [64]:
# Day of month of the current timestamp.
spark.sql(
    "SELECT current_timestamp, day(current_timestamp)"
).show(truncate=False)

+-----------------------+---------------------------------------------------------------+
|current_timestamp()    |day(CAST(current_timestamp() AS `current_timestamp()` AS DATE))|
+-----------------------+---------------------------------------------------------------+
|2022-04-15 11:35:39.669|15                                                             |
+-----------------------+---------------------------------------------------------------+



In [65]:
# Year of the current timestamp.
spark.sql(
    "SELECT current_timestamp, year(current_timestamp)"
).show(truncate=False)

+-----------------------+----------------------------------------------------------------+
|current_timestamp()    |year(CAST(current_timestamp() AS `current_timestamp()` AS DATE))|
+-----------------------+----------------------------------------------------------------+
|2022-04-15 11:36:08.016|2022                                                            |
+-----------------------+----------------------------------------------------------------+



In [66]:
# Month number of the current timestamp.
spark.sql(
    "SELECT current_timestamp, month(current_timestamp)"
).show(truncate=False)

+-----------------------+-----------------------------------------------------------------+
|current_timestamp()    |month(CAST(current_timestamp() AS `current_timestamp()` AS DATE))|
+-----------------------+-----------------------------------------------------------------+
|2022-04-15 11:36:13.323|4                                                                |
+-----------------------+-----------------------------------------------------------------+



In [67]:
# Week number within the year of the current timestamp.
spark.sql(
    "SELECT current_timestamp, weekofyear(current_timestamp)"
).show(truncate=False)

+-----------------------+----------------------------------------------------------------------+
|current_timestamp()    |weekofyear(CAST(current_timestamp() AS `current_timestamp()` AS DATE))|
+-----------------------+----------------------------------------------------------------------+
|2022-04-15 11:36:21.758|15                                                                    |
+-----------------------+----------------------------------------------------------------------+



In [68]:
# dayofmonth() returns the same value as day().
spark.sql(
    "SELECT current_timestamp, dayofmonth(current_timestamp)"
).show(truncate=False)

+-----------------------+----------------------------------------------------------------------+
|current_timestamp()    |dayofmonth(CAST(current_timestamp() AS `current_timestamp()` AS DATE))|
+-----------------------+----------------------------------------------------------------------+
|2022-04-15 11:36:52.437|15                                                                    |
+-----------------------+----------------------------------------------------------------------+



### Dealing with unix timestamp
- a unix timestamp is an integer: the number of seconds elapsed since 1970-01-01 00:00:00 UTC, so it increases by one every second

In [70]:
# Current timestamp expressed as a unix timestamp.
spark.sql(
    "SELECT current_timestamp, to_unix_timestamp(current_timestamp)"
).show(truncate=False)

+-----------------------+------------------------------------------------------------------------------------+
|current_timestamp()    |to_unix_timestamp(current_timestamp() AS `current_timestamp()`, yyyy-MM-dd HH:mm:ss)|
+-----------------------+------------------------------------------------------------------------------------+
|2022-04-15 11:39:40.631|1650002980                                                                          |
+-----------------------+------------------------------------------------------------------------------------+



In [72]:
# Unix timestamp back to a string, using the default yyyy-MM-dd HH:mm:ss format.
spark.sql(
    "SELECT current_timestamp, from_unixtime(1650004000)"
).show(truncate=False)

+-----------------------+--------------------------------------------------------------+
|current_timestamp()    |from_unixtime(CAST(1650004000 AS BIGINT), yyyy-MM-dd HH:mm:ss)|
+-----------------------+--------------------------------------------------------------+
|2022-04-15 11:40:20.784|2022-04-15 11:56:40                                           |
+-----------------------+--------------------------------------------------------------+



In [74]:
# Same conversion with an explicit output format.
spark.sql(
    "SELECT current_timestamp, from_unixtime(1650004000, 'yyyy-MM')"
).show(truncate=False)

+----------------------+--------------------------------------------------+
|current_timestamp()   |from_unixtime(CAST(1650004000 AS BIGINT), yyyy-MM)|
+----------------------+--------------------------------------------------+
|2022-04-15 11:42:31.98|2022-04                                           |
+----------------------+--------------------------------------------------+



## Numeric Functions

- abs
- sum, avg
- round
    - we can round to nearest integer (based on greater or less than .5), if we only provide col_name/ data as parameter
    - we can round to certain precision if we give a second parameter (eg. round(12.587,1) will give 12.6)
- ceil, floor
- greatest
- min, max
- rand (to generate a random number)
    - eg. select rand() as random_number
    - by default it gives between 0 and 1
    - if we want to get an integer between 1 and 100 we can write: select cast(floor(rand()*100) + 1 as int)
    - if we want only 0 or 1: select cast(round(rand()) as int)
- pow, sqrt
- cume_dist, stddev, variance

## Data type conversion
- if a value cannot be cast, the result is null (no error is thrown with the default, non-ANSI settings)
- CAST(col_name as INT)
- CAST(col_name as FLOAT)
- CAST(col_name as TIMESTAMP)

## Dealing With Nulls
- any operation on null will return null
- we can use nvl or coalesce
- nvl takes 2 argument column/data and the value to replace with if null (for eg. if it's int we can replace with 0)
- coalesce takes multiple values and considers first non null value
- nvl2 can be used to perform one action if value is not null and another if it is. It will take 3 args: data, notnull value/formula, null case value/formula

## Case When Else Statement
syntax:  
``` 
CASE  
    WHEN condition1 THEN value1  
    WHEN condition2 THEN value2  
    WHEN conditionN THEN valueN  
    ELSE defaultValue  
END AS column_name
```  
     
- if else not given after cases then all values which do not match the condition will be set to null

## Assignment

In [79]:
# Which database the session is currently using.
spark.sql(
    "SELECT current_database()"
).show(truncate=False)

+------------------+
|current_database()|
+------------------+
|default           |
+------------------+



In [80]:
# Databases known to the metastore.
spark.sql(
    "SHOW DATABASES"
).show(truncate=False)

+----------+
|namespace |
+----------+
|default   |
|nysedb    |
|siddhantdb|
+----------+



In [81]:
# Switch the session to the nysedb database (it must already exist).
spark.sql(
    "USE nysedb"
).show(truncate=False)

++
||
++
++



In [82]:
# Tables in the current database before `lines` is created.
spark.sql(
    "SHOW TABLES"
).show(truncate=False)

+--------+-------------+-----------+
|database|tableName    |isTemporary|
+--------+-------------+-----------+
|nysedb  |nyse_eod     |false      |
|nysedb  |nyse_eod_part|false      |
+--------+-------------+-----------+



In [84]:
# DDL for a single-column table holding one line of text per row.
# IF NOT EXISTS lets the cell be re-run without failing.
create_lines_table = (
    " \n"
    "CREATE TABLE IF NOT EXISTS LINES (\n"
    " s String\n"
    ")\n"
)
spark.sql(create_lines_table)

DataFrame[]

In [85]:
# Confirm that `lines` now appears in the current database.
spark.sql(
    "SHOW TABLES"
).show(truncate=False)

+--------+-------------+-----------+
|database|tableName    |isTemporary|
+--------+-------------+-----------+
|nysedb  |lines        |false      |
|nysedb  |nyse_eod     |false      |
|nysedb  |nyse_eod_part|false      |
+--------+-------------+-----------+



In [86]:
# Text file to load; the path is relative to the notebook's working directory.
word_count_data_path = './datasets/wordCountOnSparkDocuemntation.txt'
# OVERWRITE keeps this cell idempotent: the table persists between sessions, so a
# plain INTO would append the file again on every re-run and inflate all counts below.
load_data_query = f"LOAD DATA LOCAL INPATH '{word_count_data_path}' OVERWRITE INTO TABLE lines"
spark.sql(load_data_query)

DataFrame[]

In [87]:
# Peek at the first ten loaded lines (long values are truncated by show()).
spark.sql(
    "SELECT * FROM lines LIMIT 10"
).show()

+--------------------+
|                   s|
+--------------------+
|      Spark Overview|
|Apache Spark is a...|
|                    |
|         Downloading|
|Get Spark from th...|
|                    |
|If you’d like to ...|
|                    |
|Spark runs on bot...|
|                    |
+--------------------+



In [91]:
# Number of rows (lines of text) in the table.
spark.sql(
    "SELECT count(*) FROM lines"
).show()

+--------+
|count(1)|
+--------+
|      94|
+--------+



In [89]:
# Usage text for split().
spark.sql(
    "DESCRIBE  FUNCTION split"
).show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------+
|function_desc                                                                                                                             |
+------------------------------------------------------------------------------------------------------------------------------------------+
|Function: split                                                                                                                           |
|Class: org.apache.spark.sql.catalyst.expressions.StringSplit                                                                              |
|Usage: split(str, regex, limit) - Splits `str` around occurrences that match `regex` and returns an array with a length of at most `limit`|
+------------------------------------------------------------------------------------------------------------------------------------------+



In [90]:
# Split each line on single spaces; every row becomes one array of words.
spark.sql(
    "SELECT split(s, ' ') as words FROM lines LIMIT 10"
).show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|words                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [92]:
# count() sees one array per row, so this is the number of lines, not the
# number of words; the arrays have to be exploded first to count words.
spark.sql(
    "SELECT count(split(s, ' ')) as words FROM lines"
).show(truncate=False)

+-----+
|words|
+-----+
|94   |
+-----+



In [93]:
# explode() turns each array element into its own row: one word per row.
spark.sql(
    "SELECT explode(split(s, ' ')) as words FROM lines LIMIT 10"
).show(truncate=False)

+---------+
|words    |
+---------+
|Spark    |
|Overview |
|Apache   |
|Spark    |
|is       |
|a        |
|unified  |
|analytics|
|engine   |
|for      |
+---------+



In [100]:
# Total number of space-separated tokens across all lines.
spark.sql(
    "SELECT COUNT(*) from "
    "(SELECT explode(split(s, ' ')) as words FROM lines)"
).show(truncate=False)

+--------+
|count(1)|
+--------+
|944     |
+--------+



In [104]:
# Frequency of each token, most frequent first. Tokens are case-sensitive and
# the empty string (from blank lines / repeated spaces) is counted as a token.
spark.sql(
    "SELECT words, COUNT(*) as word_freq "
    "from (SELECT explode(split(s, ' ')) as words FROM lines) "
    "group by words order by word_freq desc"
).show(truncate=False)

+-------+---------+
|words  |word_freq|
+-------+---------+
|Spark  |51       |
|and    |26       |
|a      |23       |
|the    |21       |
|       |19       |
|for    |16       |
|to     |16       |
|of     |16       |
|in     |14       |
|on     |13       |
|Python |12       |
|run    |11       |
|with   |10       |
|cluster|9        |
|API    |8        |
|Scala  |8        |
|also   |8        |
|data   |7        |
|is     |7        |
|For    |7        |
+-------+---------+
only showing top 20 rows



In [105]:
# Distinct tokens; punctuation is kept, so "(Scala," and "Scala" are different values.
spark.sql(
    "SELECT distinct words from "
    "(SELECT explode(split(s, ' ')) as words FROM lines)"
).show(truncate=False)

+-------------+
|words        |
+-------------+
|installation.|
|online       |
|(Scala,      |
|include      |
|graphs       |
|launch       |
|Compatibility|
|(core        |
|2.12/2.13,   |
|(Scaladoc)   |
|Functions    |
|If           |
|—            |
|API          |
|EC2          |
|documentation|
|(e.g.        |
|It’s         |
|specifies    |
|introduction |
+-------------+
only showing top 20 rows



In [106]:
# Number of distinct tokens.
spark.sql(
    "SELECT count(distinct words) from "
    "(SELECT explode(split(s, ' ')) as words FROM lines)"
).show(truncate=False)

+---------------------+
|count(DISTINCT words)|
+---------------------+
|477                  |
+---------------------+

