In [None]:
import sys
sys.path

# Check python version
!which python

# Check for pyspark
!pip show findspark

### Using pip

- [Reference](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)
- Create a new `conda` environment

  ```bash
  conda create --name pyspark python=3.7
  conda activate pyspark
  # ...work inside the environment; when finished, leave it with:
  conda deactivate
  ```

- Install specific `pyspark` version
  - `pip index versions pyspark`
  - `pip install pyspark==2.4.8`
- Install addons
  - Spark SQL
    - `pip install pyspark[sql]`
  - Pandas API with plotly
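    - `pip install pyspark[pandas_on_spark] plotly` (the pandas API on Spark requires PySpark 3.2 or newer, so it is not available with the 2.4.8 pin above)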
  - To work with Jupyter notebooks
    - `conda install -c conda-forge --name pyspark ipykernel -y`
  - If you want to use a local Spark installation
    - `conda install -c conda-forge findspark -y`

In [None]:
# Needed only when working against a local Spark installation
# (skip this cell when pyspark was installed with pip).
import findspark

findspark.init()

In [None]:
import os

# If you are using the pip-installed PySpark module, SPARK_HOME should not be
# set in the environment. Use .get() so this cell prints None instead of
# raising KeyError when the variable is absent.
print(os.environ.get('SPARK_HOME'))

# To delete an environment variable for this kernel, modify os.environ from
# Python. A `!` shell escape runs in a separate process and cannot change the
# kernel's environment. pop() with a default is a no-op when the variable is
# not set.
os.environ.pop('SPARK_HOME', None)

In [None]:
from pyspark import SparkContext

# getOrCreate() hands back the SparkContext that is already running, if any,
# so this cell can be re-run without failing with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

In [None]:
# Distribute a small Python list as an RDD, then count its elements.
nums = sc.parallelize([1, 2, 3, 4])

nums.count()

In [None]:
# Pull the RDD contents back to the driver as a plain list ...
nums1 = nums.collect()

# ... and print the elements one per line.
for num in nums1:
    print('%i ' % num)

In [None]:
# Entry point for Spark SQL: a session named 'example' with master "local[*]".
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('example')
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# aggregate(): compute a (sum, count) pair over an RDD built with 2 partitions.
listRDD = sc.parallelize([1, 2, 3, 4, 5, 6], 2)


def seqOp(local_result, list_element):
    """Fold one element into a (sum, count) pair; equivalent to a local combiner."""
    return (local_result[0] + list_element, local_result[1] + 1)


def combOp(some_local_result, another_local_result):
    """Merge two (sum, count) pairs; equivalent to the reducer."""
    return (
        some_local_result[0] + another_local_result[0],
        some_local_result[1] + another_local_result[1],
    )


# (0, 0) is the zero value: sum 0, count 0.
listRDD.aggregate((0, 0), seqOp, combOp)

In [None]:
# Print the version string reported by the SparkContext `sc`.
print(sc.version)