# Description
Basic examples of pyspark code in action

### Using pip

- [Reference](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)
- Create a new `conda` environment

  ```bash
  conda create --name pyspark python=3.7
  conda activate pyspark
  conda deactivate  # run only when you are done working in the environment
  ```

- Install specific `pyspark` version
  - `pip index versions pyspark`
  - `pip install pyspark==2.4.8`
  - `conda install -c conda-forge pyspark`
- Install addons
  - Spark SQL
    - `pip install pyspark[sql]`
  - Pandas API with plotly
    - `pip install pyspark[pandas_on_spark] plotly`
  - To work with Jupyter notebooks
    - `conda install -c conda-forge --name pyspark ipykernel -y`
  - If you want to use a local Spark installation
    - `conda install -c conda-forge findspark -y`

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) a local session that runs on all available cores.
# If your data volume is small enough, lowering spark.sql.shuffle.partitions
# (Spark's default is 200) can improve performance significantly, because
# far fewer tiny shuffle tasks have to be scheduled.
spark = (
    SparkSession.builder
    .appName('PySpark')
    .master("local[*]")
    .config("spark.sql.shuffle.partitions", 2)
    .getOrCreate()
)

sc = spark.sparkContext

# Check spark version
print(sc.version)

# Read the setting back through the public runtime-config API. The private
# sparkContext._conf only reflects the context's start-up configuration, so
# it can miss builder options when getOrCreate() reuses an existing session.
print(spark.conf.get("spark.sql.shuffle.partitions"))

In [None]:
# Spark SQL example: build two small DataFrames and join them on the shared
# "name" column.
employee_names = ["Alice", "Bob", "John"]
department_rows = [("Alice", "Finance"), ("Bob", "HR"), ("John", "IT")]

# A flat list of strings needs an explicit element type ("string"); toDF then
# gives the single column its final name.
employee = spark.createDataFrame(employee_names, "string").toDF("name")
department = spark.createDataFrame(department_rows).toDF("name", "department")

joined = employee.join(department, "name")
joined.show()

In [None]:
# Reading JSON data
import os

# expanduser("~") resolves the home directory even when the HOME environment
# variable is not set (os.environ["HOME"] would raise KeyError in that case),
# and os.path.join avoids hand-built path strings.
jsonpath = os.path.join(
    os.path.expanduser("~"),
    "Desktop", "tmp", "training", "spark-advanced", "datasets", "spark-ml",
    "spark-ml-input.json",
)

inputdf = spark.read.json(jsonpath)
inputdf.show(2)

# Optional partitioning experiments (uncomment to try them):
# print(inputdf.rdd.getNumPartitions())

# repartitiondf = inputdf.repartition(5)
# print(repartitiondf.rdd.getNumPartitions())

# print(inputdf.count())

In [None]:
# Loading Sample data
from pyspark.sql.types import DateType
import pyspark.sql.functions as F

# One purchase per tuple: (name, date, product, price)
shopping_data = [
    ('Alex', '2018-10-10', 'Paint', 80),
    ('Alex', '2018-04-02', 'Ladder', 20),
    ('Alex', '2018-06-22', 'Stool', 20),
    ('Alex', '2018-12-09', 'Vacuum', 40),
    ('Alex', '2018-07-12', 'Bucket', 5),
    ('Alex', '2018-02-18', 'Gloves', 5),
    ('Alex', '2018-03-03', 'Brushes', 30),
    ('Alex', '2018-09-26', 'Sandpaper', 10),
]

column_names = ['name', 'date', 'product', 'price']
raw_df = spark.createDataFrame(shopping_data, column_names)

# The dates are supplied as strings; cast the column so it carries a date type.
df = raw_df.withColumn('date', F.col('date').cast(DateType()))

df.printSchema()

In [None]:
# Basic aggregations
# NOTE: this import rebinds the name `sum` to Spark's column aggregate for
# the rest of the notebook, shadowing Python's builtin sum().
from pyspark.sql.functions import count, sum

# Whole-DataFrame aggregation: number of rows and total price.
row_count = count('*').alias('cnt')
price_total = sum('price').alias('sm')
df.agg(row_count, price_total).show()

# Group by aggregations: number of rows per distinct 'color' value.
rows_per_color = inputdf.groupBy('color').agg(count('*').alias('cnt'))
rows_per_color.show(5)

In [None]:
# Selecting frame in a windowing function
# reference : https://towardsdatascience.com/spark-sql-102-aggregations-and-window-functions-9f829eaa7549
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Ordered window with no explicit frame: Spark applies its default growing
# frame (start of the partition up to the current row).
w_1 = Window.partitionBy('name').orderBy('date')

# Explicit row frame: the previous row plus the current one.
# Window.currentRow can be substituted with 0 as well.
w_2 = Window.partitionBy('name').orderBy('date').rowsBetween(-1, Window.currentRow)

# F.sum is referenced explicitly so this cell does not depend on an earlier
# cell having replaced Python's builtin sum() with the Spark aggregate.

# Adding current row and all the ones before it
df.withColumn('sum_1', F.sum('price').over(w_1)).show()

# Adding current row and 1 before
df.withColumn('sum_1', F.sum('price').over(w_2)).show()


In [None]:
# Selecting frame continued
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Running frame: first row of the partition up to the current row.
# For this data it yields the same totals as an ordered window without an
# explicit frame, because every date occurs only once per name. The default
# frame of an ordered window is range-based, so rows that tie on 'date' would
# be summed together there, while this row-based frame adds them one by one.
w_3 = Window.partitionBy('name').orderBy('date').rowsBetween(Window.unboundedPreceding, 0)

# Mirror image of w_3: the current row through the last row of the partition.
w_4 = Window.partitionBy('name').orderBy('date').rowsBetween(0, Window.unboundedFollowing)

# F.sum is referenced explicitly so this cell does not depend on Python's
# builtin sum() having been shadowed by an earlier import.
df.withColumn('sum_1', F.sum('price').over(w_3)).show()

df.withColumn('sum_1', F.sum('price').over(w_4)).show()

In [None]:
# Define a window
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Base window: one partition per customer name.
w0 = Window.partitionBy('name')

# Rank vs Dense Rank
# Both rankings use the same ordering: most expensive purchase first.
by_price_desc = w0.orderBy(F.col('price').desc())

# dense_rank: tied prices share a rank and the next rank follows without a gap.
dense_ranked = df.withColumn('price_rank', F.dense_rank().over(by_price_desc))
dense_ranked.show()

# rank: tied prices share a rank and the ranks after a tie are skipped.
ranked = df.withColumn('price_rank', F.rank().over(by_price_desc))
ranked.show()


In [None]:
# Distribute the data into buckets
# Within each name partition, order by price (highest first) and split the
# rows into 4 buckets with ntile.
price_order = w0.orderBy(F.col('price').desc())
bucket_number = F.ntile(4).over(price_order)

df.withColumn('price_bucket', bucket_number).show()

In [None]:
# Using lead and lag functions
# Both look-ups walk each name partition in chronological order.
date_order = w0.orderBy(F.col('date'))

previous_date = F.lag('date', 1).over(date_order)   # date of the row before
next_date = F.lead('date', 1).over(date_order)      # date of the row after

purchase_gaps = (
    df.withColumn('days_since_last', F.datediff('date', previous_date))
    .withColumn('days_before_next', F.datediff(next_date, 'date'))
)
purchase_gaps.show()


In [None]:
# Using collect_set, collect_list
# NOTE(review): calling partitionBy on the existing w0 spec appears to replace
# its 'name' partitioning with 'price' rather than add to it - confirm this is
# the intended grouping.
same_price = w0.partitionBy('price')

# Gather the distinct products that fall into the same price partition.
products_at_price = F.collect_set('product').over(same_price)

df.withColumn('products', products_at_price).show()

In [None]:
# Calculate moving average of price over the last 30 days
SECONDS_PER_DAY = 86400


def days(i):
    """Return the number of seconds in *i* days."""
    return i * SECONDS_PER_DAY


# rangeBetween works on the numeric value of the ordering column, so the date
# is converted to epoch seconds and the frame is expressed in seconds:
# from 30 days before the current row's time up to the current row.
trailing_30_days = w0.orderBy(F.col('unix_time')).rangeBetween(-days(30), 0)
date_order = w0.orderBy(F.col('date'))

(
    df.withColumn('unix_time', F.col('date').cast('timestamp').cast('long'))
    .withColumn('moving_avg', F.avg('price').over(trailing_30_days))
    .withColumn(
        'days_since_last',
        F.datediff('date', F.lag('date', 1).over(date_order)),
    )
    .show()
)

In [None]:
# Trend Analysis Example
import os

# expanduser('~') resolves the home directory even when the HOME environment
# variable is not set (os.environ['HOME'] would raise KeyError in that case).
salary_csv = os.path.join(
    os.path.expanduser('~'), 'tmp', 'sample-dataset', 'salary.csv'
)

# Read the CSV with a header row and let Spark infer the column types.
csvdf1 = (
    spark.read
    .options(header=True, inferSchema=True, delimiter=',', dateFormat='MM/dd/yyyy')
    .csv(salary_csv)
)

csvdf1.show(2)
csvdf1.printSchema()

# Parse the date: add 'saldt_parse' as a date parsed from the MM/dd/yyyy
# values in 'saldt'.
csvdf2 = csvdf1.withColumn('saldt_parse', F.to_date('saldt', format='MM/dd/yyyy'))

csvdf2.show(2)

In [None]:
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Per-employee window, ordered by salary date ascending.
wd = Window.partitionBy('empname').orderBy(F.col('saldt_parse').asc())

# Salary of the preceding row in the window; when there is none (first row of
# an employee) it is treated as 0.
previous_salary = F.coalesce(F.lag('salary', 1).over(wd), F.lit(0))
salary_change = F.col('salary') - previous_salary

# 'UP' only for a strictly positive change; everything else is 'DOWN'.
trend_label = F.when(salary_change > 0, 'UP').otherwise('DOWN')

with_trend = csvdf2.withColumn('trend', trend_label)
with_trend.withColumn('diff', salary_change).show()

In [None]:
# TODO Streaming in PySpark