# Functions

+ <a href="#functions">1. Built In Functions</a>
    + <a href="#string">String Functions</a>
    + <a href="#numeric">Numeric Functions</a>
    + <a href="#date">Date Functions</a>
+ <a href="#dates">2. Working with Dates</a>
+ <a href="#user">3. User Defined Functions</a>
+ <a href="#join">4. Working with Joins</a>
+ <a href="#challenges">5. Challenges</a>
----

# Set up

In [1]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every cell below: getOrCreate() returns the
# already-active session if there is one, otherwise it builds a new one.
spark = SparkSession.builder.getOrCreate()

In [2]:
# Bare expression: the notebook renders the session object as the cell output.
spark

---

In [3]:
# preprocessed crimes data
from pyspark.sql.functions import to_timestamp, col, lit

In [4]:
# Load the crimes CSV (first line is the header), replace the string `Date`
# column with a parsed timestamp, and keep rows whose `Date` is <= '2018-11-11'.
# NOTE(review): max(Date) shown later is 2018-11-11 00:00:00, so the literal
# appears to be compared as midnight -- confirm that later rows on that day
# are meant to be excluded.
rc = (
    spark.read
    .csv('../data/chicago_crimes.csv', header=True)
    .withColumn('Date', to_timestamp(col('Date'), 'MM/dd/yyyy hh:mm:ss a'))
    .filter(col('Date') <= lit('2018-11-11'))
)
rc.show(5)

+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|               Date|               Block|IUCR|Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|10224738|   HY411648|2015-09-05 13:30:00|     043XX S WOOD ST|0486|     BATTERY|DOMESTIC BATTERY ...|           RESIDENCE| false|    true|0924|     00

------

<p id="functions"></p>

# 1) Built In Functions

In [5]:
from pyspark.sql import functions

In [8]:
# List every name exposed by pyspark.sql.functions, including all built-in functions.
dir(functions)

['Column',
 'DataFrame',
 'DataType',
 'PandasUDFType',
 'PythonEvalType',
 'SparkContext',
 'StringType',
 'UserDefinedFunction',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_binary_mathfunctions',
 '_collect_list_doc',
 '_collect_set_doc',
 '_create_binary_mathfunction',
 '_create_column_from_literal',
 '_create_column_from_name',
 '_create_function',
 '_create_function_over_column',
 '_create_udf',
 '_create_window_function',
 '_functions',
 '_functions_1_4_over_column',
 '_functions_1_6_over_column',
 '_functions_2_1_over_column',
 '_functions_2_4',
 '_functions_deprecated',
 '_functions_over_column',
 '_lit_doc',
 '_message',
 '_options_to_str',
 '_string_functions',
 '_test',
 '_to_java_column',
 '_to_seq',
 '_window_functions',
 '_wrap_deprecated_function',
 'abs',
 'acos',
 'add_months',
 'approxCountDistinct',
 'approx_count_distinct',
 'array',
 'array_contains',
 'array_distinct',
 'array_exc

-------

<p id="string"></p>

## String Functions

### Display the Primary Type column in lowercase and in uppercase, and show the first 4 characters of the column

In [11]:
from pyspark.sql.functions import lower, upper, substring

**NOTE: for `substring`, the position is 1-based, not 0-based (it does not start from 0 as indexing does in many other languages).**

In [12]:
# Show the docstring for `substring`; the note in its output confirms that
# positions are 1-based.
help(substring)

Help on function substring in module pyspark.sql.functions:

substring(str, pos, len)
    Substring starts at `pos` and is of length `len` when str is String type or
    returns the slice of byte array that starts at `pos` in byte and is of length `len`
    when str is Binary type.
    
    .. note:: The position is not zero based, but 1 based index.
    
    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(substring(df.s, 1, 2).alias('s')).collect()
    [Row(s='ab')]
    
    .. versionadded:: 1.5



In [14]:
# First, check whether the `Primary Type` column is a string, since the
# string functions below are applied to it.
rc.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: string (nullable = true)
 |-- Domestic: string (nullable = true)
 |-- Beat: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Ward: string (nullable = true)
 |-- Community Area: string (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: string (nullable = true)
 |-- Y Coordinate: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Location: string (nullable = true)



In [24]:
# Project three derived views of `Primary Type`: lowercased, uppercased,
# and its first 4 characters (substring positions start at 1).
rc.select(
    lower(col('Primary Type')),
    upper(col('Primary Type')),
    substring(col('Primary Type'), 1, 4),
).show(5)

+-------------------+-------------------+-----------------------------+
|lower(Primary Type)|upper(Primary Type)|substring(Primary Type, 1, 4)|
+-------------------+-------------------+-----------------------------+
|            battery|            BATTERY|                         BATT|
|              theft|              THEFT|                         THEF|
|              theft|              THEFT|                         THEF|
|          narcotics|          NARCOTICS|                         NARC|
|            assault|            ASSAULT|                         ASSA|
+-------------------+-------------------+-----------------------------+
only showing top 5 rows



------

<p id="numeric"></p>

## Numeric Functions

### Show the oldest date and most recent date

In [25]:
from pyspark.sql.functions import min, max

In [27]:
# Oldest timestamp in the data. `min` here is pyspark.sql.functions.min
# (imported in the previous cell), which shadows Python's builtin min.
rc.select(min(col('Date'))).show(1)

+-------------------+
|          min(Date)|
+-------------------+
|2001-01-01 00:00:00|
+-------------------+



In [28]:
# Most recent timestamp in the data. `max` here is pyspark.sql.functions.max,
# which shadows Python's builtin max.
rc.select(max(col('Date'))).show(1)

+-------------------+
|          max(Date)|
+-------------------+
|2018-11-11 00:00:00|
+-------------------+



-------

<p id="date"></p>

### What is 3 days earlier than the oldest date and 3 days later than the most recent date?

In [29]:
from pyspark.sql.functions import date_add, date_sub

In [30]:
# Show the docstring for `date_add` (signature: date_add(start, days)).
help(date_add)

Help on function date_add in module pyspark.sql.functions:

date_add(start, days)
    Returns the date that is `days` days after `start`
    
    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(date_add(df.dt, 1).alias('next_date')).collect()
    [Row(next_date=datetime.date(2015, 4, 9))]
    
    .. versionadded:: 1.5



In [None]:
# 3 days earlier than the oldest date: aggregate with min() first,
# then shift the result back by 3 days with date_sub().
rc.select(date_sub(min(col('Date')), 3)).show(1)

In [None]:
# 3 days later than the most recent date: aggregate with max() first,
# then shift the result forward by 3 days with date_add().
rc.select(date_add(max(col('Date')), 3)).show(1)

------

<p id="dates"></p>

# 2) Working with Dates

In [5]:
from pyspark.sql.functions import to_date, to_timestamp, lit

### Parsing different strings to Date and Timestamp

### 2019-12-25 13:30:00

In [7]:
# One-row, one-column DataFrame holding the date as a plain string.
df = spark.createDataFrame(
    [('2019-12-25 13:30:00',)],  # rows: a single one-field tuple (the value)
    ['Christmas'],               # column names
)

In [8]:
# Display the single row to confirm the raw string value.
df.show(1)

+-------------------+
|          Christmas|
+-------------------+
|2019-12-25 13:30:00|
+-------------------+



In [10]:
# Parse the string to a date; the pattern must describe the whole input
# string, and the time-of-day part is dropped in the resulting date.
df.select(to_date(col('Christmas'), 'yyyy-MM-dd HH:mm:ss')).show()

+-------------------------------------------+
|to_date(`Christmas`, 'yyyy-MM-dd HH:mm:ss')|
+-------------------------------------------+
|                                 2019-12-25|
+-------------------------------------------+



In [11]:
# Parse the same string to a timestamp, which keeps the time of day
# (`HH` is the 24-hour clock hour).
df.select(to_timestamp(col('Christmas'),  'yyyy-MM-dd HH:mm:ss')).show()

+------------------------------------------------+
|to_timestamp(`Christmas`, 'yyyy-MM-dd HH:mm:ss')|
+------------------------------------------------+
|                             2019-12-25 13:30:00|
+------------------------------------------------+



### 25/Dec/2019 13:30:00

In [13]:
# Same instant, now written as day/abbreviated-month-name/year.
df = spark.createDataFrame(
    [('25/Dec/2019 13:30:00',)],  # rows: a single one-field tuple
    ['Christmas'],                # column names
)

In [14]:
# Display the row to confirm the raw string value.
df.show()

+--------------------+
|           Christmas|
+--------------------+
|25/Dec/2019 13:30:00|
+--------------------+



In [15]:
# parse to date and timestamp

In [18]:
# Parse to a date: `MMM` matches the abbreviated month name ("Dec").
df.select(to_date(col('Christmas'), 'dd/MMM/yyyy HH:mm:ss')).show()

+--------------------------------------------+
|to_date(`Christmas`, 'dd/MMM/yyyy HH:mm:ss')|
+--------------------------------------------+
|                                  2019-12-25|
+--------------------------------------------+



In [19]:
# Parse to a timestamp with the same pattern; the time of day is preserved.
df.select(to_timestamp(col('Christmas'), 'dd/MMM/yyyy HH:mm:ss')).show()

+-------------------------------------------------+
|to_timestamp(`Christmas`, 'dd/MMM/yyyy HH:mm:ss')|
+-------------------------------------------------+
|                              2019-12-25 13:30:00|
+-------------------------------------------------+



### 12/25/2019 01:30:00 PM

In [23]:
# Same instant, now in US month/day/year order with a 12-hour clock and AM/PM marker.
df = spark.createDataFrame(
    [('12/25/2019 01:30:00 PM',)],  # rows: a single one-field tuple
    ['Christmas'],                  # column names
)
# truncate=False prints the full string instead of cutting long values short.
df.show(1, truncate=False)

+----------------------+
|Christmas             |
+----------------------+
|12/25/2019 01:30:00 PM|
+----------------------+



In [21]:
# parse to date and timestamp

In [25]:
# Parse to a date: `hh` is the 12-hour clock hour and `a` is the AM/PM marker.
df.select(to_date(col('Christmas'), 'MM/dd/yyyy hh:mm:ss a')).show()

+---------------------------------------------+
|to_date(`Christmas`, 'MM/dd/yyyy hh:mm:ss a')|
+---------------------------------------------+
|                                   2019-12-25|
+---------------------------------------------+



In [26]:
# Parse to a timestamp: "01:30:00 PM" comes out as 13:30:00 in the result.
df.select(to_timestamp(col('Christmas'), 'MM/dd/yyyy hh:mm:ss a')).show()

+--------------------------------------------------+
|to_timestamp(`Christmas`, 'MM/dd/yyyy hh:mm:ss a')|
+--------------------------------------------------+
|                               2019-12-25 13:30:00|
+--------------------------------------------------+



------

In [27]:
# Re-read the crimes CSV without the `Date` parsing that was applied to `rc`,
# so `Date` keeps its original string form.
new_rc = spark.read.csv('../data/chicago_crimes.csv', header=True)

In [29]:
# Show 2 rows untruncated to inspect the raw `Date` format
# (e.g. "09/05/2015 01:30:00 PM").
new_rc.show(2, truncate=False)

+--------+-----------+----------------------+-------------------+----+------------+-----------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+----------------------+------------+-------------+-----------------------------+
|ID      |Case Number|Date                  |Block              |IUCR|Primary Type|Description            |Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|Updated On            |Latitude    |Longitude    |Location                     |
+--------+-----------+----------------------+-------------------+----+------------+-----------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+----------------------+------------+-------------+-----------------------------+
|10224738|HY411648   |09/05/2015 01:30:00 PM|043XX S WOOD ST    |0486|BATTERY     |DOMESTIC BATTERY SIM

---------

<p id="user"></p>

# 3) User Defined Functions

<p id="join"></p>

# 4) Working with Joins

<p id="challenges"></p>

# 5) Challenges