## Spark Session
To get all the "various Spark parameters as key-value pairs" for a SparkSession ("the entry point to programming Spark with the Dataset and DataFrame API"), run the following (this uses the Spark Python API; Scala would be very similar):

In [211]:
from pyspark.sql import SparkSession
# Configure a local-mode session step by step, then fetch (or create) it.
builder = SparkSession.builder.master("local").appName("Spark session")
builder = builder.config("spark.sql.shuffle.partitions", 100)  # custom settings
spark = builder.getOrCreate()

In [76]:
# List every configuration entry of the session's SparkContext as (key, value) pairs.
spark_conf = spark.sparkContext.getConf()
spark_conf.getAll()

[('spark.master', 'local'),
 ('spark.driver.port', '53216'),
 ('spark.app.id', 'local-1569014349990'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.driver.host', '192.168.3.105'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.name', 'Spark session'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.sql.shuffle.partitions', '100')]

### Create a DataFrame/Dataset from a collection (e.g. list or set)

https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.SparkSession.createDataFrame

In [80]:
from pyspark.sql.types import IntegerType

# Build a single-column DataFrame from a plain Python list of ints.
mylist = [1, 2, 3, 4]
int_df = spark.createDataFrame(mylist, schema=IntegerType())
int_df.show()

+-----+
|value|
+-----+
|    1|
|    2|
|    3|
|    4|
+-----+



In [84]:
from pyspark.sql.types import FloatType

# dataframe from a tuple (note: despite the name `myset`, the parentheses
# below build a tuple, not a set)
myset = (1., 2., 3., 4.)
spark.createDataFrame(myset, FloatType()).show()

+-----+
|value|
+-----+
|  1.0|
|  2.0|
|  3.0|
|  4.0|
+-----+



In [85]:
# Each tuple is one row; the second argument supplies the column names.
mylist = [
    ('Alice', 34),
    ('Bob', 25),
    ('Simon', 41),
]
column_names = ['name', 'age']

spark.createDataFrame(mylist, column_names).show()

+-----+---+
| name|age|
+-----+---+
|Alice| 34|
|  Bob| 25|
|Simon| 41|
+-----+---+



In [86]:
# create from a list of dictionaries: one dict per row, keys become column names
# (the next cell builds the same data from Row objects, without the warning)
mydict = [
    {'name': 'Alice', 'age': 34},
    {'name': 'Bob', 'age': 25},
    {'name': 'Simon', 'age': 41},
]

dict_df = spark.createDataFrame(mydict)
dict_df.show()



+---+-----+
|age| name|
+---+-----+
| 34|Alice|
| 25|  Bob|
| 41|Simon|
+---+-----+



In [87]:
# the same but without the warning: convert each dict to a Row first
from pyspark.sql import Row
mydict = [
    {'name': 'Alice', 'age': 34},
    {'name': 'Bob', 'age': 25},
    {'name': 'Simon', 'age': 41},
]

rows = [Row(**record) for record in mydict]
spark.createDataFrame(rows).show()

+---+-----+
|age| name|
+---+-----+
| 34|Alice|
| 25|  Bob|
| 41|Simon|
+---+-----+



### Create a DataFrame for a range of numbers
https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.SparkSession.range

In [96]:
# range(start, end, step) yields a single column, renamed here to 'number'
even_numbers = spark.range(0, 10, 2).toDF('number')
even_numbers.show()

+------+
|number|
+------+
|     0|
|     2|
|     4|
|     6|
|     8|
+------+



### Access the DataFrameReaders

In [100]:
# Read the CSV with the reader's defaults. No `header` option is set, so the
# header line comes back as the first data row and the columns are auto-named
# _c0, _c1, _c2 (visible in the output below).
reader = spark.read
df = reader.csv('data/2010-summary.csv')
df.show()

+--------------------+-------------------+-----+
|                 _c0|                _c1|  _c2|
+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United State

### Register User Defined Functions (UDFs).


In [101]:
def power2(value):
    """Return ``value`` multiplied by itself.

    Spark hands SQL NULLs to Python UDFs as ``None``; ``None * None`` would
    raise ``TypeError``, so ``None`` is passed through unchanged.

    Args:
        value: a number, or ``None``.

    Returns:
        ``value * value``, or ``None`` when ``value`` is ``None``.
    """
    if value is None:
        return None
    return value * value

# sample DataFrame with a single 'nums' column (0 through 4) for the UDF examples
df = spark.range(5).toDF('nums')

In [103]:
# show the input rows before applying the UDF
df.show()

+----+
|nums|
+----+
|   0|
|   1|
|   2|
|   3|
|   4|
+----+



In [108]:
from pyspark.sql.functions import udf
from pyspark.sql.functions import col
from pyspark.sql.types import LongType

# udf() defaults to a StringType return type when none is given, which would
# silently turn the squared numbers into strings. Declaring LongType keeps the
# result numeric, matching the bigint column produced by spark.range().
power2udf = udf(power2, LongType())

df.select(power2udf(col('nums'))).show()

+------------+
|power2(nums)|
+------------+
|           0|
|           1|
|           4|
|           9|
|          16|
+------------+



In [110]:
# register the UDF for SQL
# As with udf(), register() defaults to a StringType return type for a plain
# Python function, so declare LongType to keep the squared values numeric.
from pyspark.sql.types import LongType

spark.udf.register('power2', power2, LongType())

df.selectExpr('power2(nums)').show()

+------------+
|power2(nums)|
+------------+
|           0|
|           1|
|           4|
|           9|
|          16|
+------------+



## Close Session

In [174]:
# shut down the Spark session created at the top of the notebook
spark.stop()