# Create Spark session

In [3]:
# findspark.init() is called before the pyspark import below so that the
# Spark installation can be found from this kernel.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise start a new one
# (akin to the `spark` object the pyspark shell provides when it is started).
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
spark

# Checking Spark session options

In [4]:
# Get the SparkContext that backs the session.
sc = spark.sparkContext
# List every configuration entry as (key, value) tuples.
# NOTE(review): the output exposes host names, ports and paths -- consider clearing it before sharing.
sc.getConf().getAll()

[('spark.history.kerberos.keytab', 'none'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.driver.port', '44428'),
 ('spark.driver.appUIAddress', 'http://slalomdsvm:4040'),
 ('spark.history.ui.port', '18081'),
 ('spark.driver.memory', '512M'),
 ('spark.driver.extraLibraryPath',
  '/usr/hdp/current/hadoop-client/lib/native:/usr/hdp/current/hadoop-client/lib/native/Linux-amd64-64'),
 ('spark.history.fs.cleaner.interval', '7d'),
 ('spark.app.id', 'application_1596825752622_0001'),
 ('spark.shuffle.io.serverThreads', '128'),
 ('spark.yarn.historyServer.address', 'slalomdsvm:18081'),
 ('spark.sql.streaming.streamingQueryListeners', ''),
 ('spark.executor.extraLibraryPath',
  '/usr/hdp/current/hadoop-client/lib/native:/usr/hdp/current/hadoop-client/lib/native/Linux-amd64-64'),
 ('spark.sql.statistics.fallBackToHdfs', 'true'),
 ('spark.executorEnv.PYTHONPATH',
  '{{PWD}}/pyspark.zip<CPS>{{PWD}}/py4j-0.10.7-src.zip'),
 ('spark.shuffle.file.buffer', '1m'),
 ('spark.history.provider',
  'org.apac

# Imports

In [5]:
import pandas as pd

%matplotlib inline

# Create and see dataframes

In [18]:
# People table: each tuple is one row, in the same order as col_names.
col_names = ['id', 'first_name', 'last_name']
rows = [
    (1, 'John', 'Doe'), (2, 'Jane', 'Doe'),
    (3, 'Herbie', 'Hancock'), (4, 'Erin', 'brockovich'),
]

# Column types are not given here; Spark infers them from the data.
df1 = spark.createDataFrame(data=rows, schema=col_names)

In [19]:
# Sock counts per person id: each tuple is one row, in the same order as col_names.
col_names = ['id', 'number_sox']
rows = [(1, 24), (2, 30), (3, 29), (4, 40)]

# Column types are not given here; Spark infers them from the data.
df2 = spark.createDataFrame(data=rows, schema=col_names)

# See the dataframes

In [20]:
# Print df1's column names, types and nullability as a tree.
df1.printSchema()

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)



In [21]:
# Print df2's column names, types and nullability as a tree.
df2.printSchema()

root
 |-- id: long (nullable = true)
 |-- number_sox: long (nullable = true)



Show the DF:

  * Asking for `df1` only shows a reference to the object (its column names and types), not its rows. This is because of Spark's lazy evaluation: the dataframe's contents have not been computed yet (`createDataFrame()` is evaluated lazily, like a transformation).
  * The dataframe will only be created when we apply an action to it like `show()`.
  * Note the difference in duration between the 2 cells.

In [22]:
# Evaluating the bare name displays only the DataFrame's repr (column names and types), no rows.
df1

DataFrame[id: bigint, first_name: string, last_name: string]

In [23]:
# show() prints the rows of df1 as a text table.
df1.show()

+---+----------+----------+
| id|first_name| last_name|
+---+----------+----------+
|  1|      John|       Doe|
|  2|      Jane|       Doe|
|  3|    Herbie|   Hancock|
|  4|      Erin|brockovich|
+---+----------+----------+



In [24]:
# Evaluating the bare name displays only the DataFrame's repr (column names and types), no rows.
df2

DataFrame[id: bigint, number_sox: bigint]

In [25]:
# show() prints the rows of df2 as a text table.
df2.show()

+---+----------+
| id|number_sox|
+---+----------+
|  1|        24|
|  2|        30|
|  3|        29|
|  4|        40|
+---+----------+



# Joining dataframes

In [27]:
# Inner-join the two frames on their shared 'id' column.
# Joining on the column name yields a single 'id' column in the result.
df = df1.join(df2, on='id', how='inner')
df.show()

+---+----------+----------+----------+
| id|first_name| last_name|number_sox|
+---+----------+----------+----------+
|  1|      John|       Doe|        24|
|  3|    Herbie|   Hancock|        29|
|  2|      Jane|       Doe|        30|
|  4|      Erin|brockovich|        40|
+---+----------+----------+----------+



# Selecting columns

Get a column object with:

In [28]:
# Attribute access on a DataFrame returns a Column object, not the column's values.
df.id

Column<b'id'>

In [36]:
# Keep only the listed columns; the result is a new DataFrame and df is unchanged.
selected_df = df.select(['id', 'first_name'])
selected_df.show()

+---+----------+
| id|first_name|
+---+----------+
|  1|      John|
|  3|    Herbie|
|  2|      Jane|
|  4|      Erin|
+---+----------+



# Filtering rows

In [37]:
# Keep rows whose sock count is at most 25 or at least 35.
# Each comparison needs its own parentheses because `|` binds tighter than `<=` / `>=` in Python.
filtered_df = df.where(
    (df['number_sox'] <= 25) | (df['number_sox'] >= 35)
)
filtered_df.show()

+---+----------+----------+----------+
| id|first_name| last_name|number_sox|
+---+----------+----------+----------+
|  1|      John|       Doe|        24|
|  4|      Erin|brockovich|        40|
+---+----------+----------+----------+



# Randomizing and ordering rows

In [41]:
from pyspark.sql.functions import rand

# Shuffle the rows by sorting on a column of random numbers.
# A fixed seed makes the shuffle repeatable when the notebook is re-run
# (Spark notes rand() can still vary if the data partitioning changes).
randomized_df = df.orderBy(rand(seed=42))
randomized_df.show()

+---+----------+----------+----------+
| id|first_name| last_name|number_sox|
+---+----------+----------+----------+
|  2|      Jane|       Doe|        30|
|  4|      Erin|brockovich|        40|
|  1|      John|       Doe|        24|
|  3|    Herbie|   Hancock|        29|
+---+----------+----------+----------+



In [42]:
# Sort the shuffled rows by 'id' (ascending by default).
randomized_df.orderBy('id').show()

+---+----------+----------+----------+
| id|first_name| last_name|number_sox|
+---+----------+----------+----------+
|  1|      John|       Doe|        24|
|  2|      Jane|       Doe|        30|
|  3|    Herbie|   Hancock|        29|
|  4|      Erin|brockovich|        40|
+---+----------+----------+----------+



In [51]:
# Sort by last name, breaking ties by ascending sock count.
# String ordering is case-sensitive, which is why 'brockovich' ends up after 'Hancock'.
(
    randomized_df
    .orderBy('last_name', 'number_sox')
    .show()
)

+---+----------+----------+----------+
| id|first_name| last_name|number_sox|
+---+----------+----------+----------+
|  1|      John|       Doe|        24|
|  2|      Jane|       Doe|        30|
|  3|    Herbie|   Hancock|        29|
|  4|      Erin|brockovich|        40|
+---+----------+----------+----------+



In [52]:
from pyspark.sql.functions import desc

# Ascending by last name, then descending sock count within each last name.
(
    randomized_df
    .orderBy('last_name', desc('number_sox'))
    .show()
)

+---+----------+----------+----------+
| id|first_name| last_name|number_sox|
+---+----------+----------+----------+
|  2|      Jane|       Doe|        30|
|  1|      John|       Doe|        24|
|  3|    Herbie|   Hancock|        29|
|  4|      Erin|brockovich|        40|
+---+----------+----------+----------+



# Aggregation

# Expanding a "list" column into rows

# Collapsing multiple rows into a "list" column

# Applying a lambda function to a column

# One-hot-encoding a column

# Loading data from a CSV file on HDFS into a Spark dataframe

# Sampling rows

In [None]:
# Number of rows before sampling.
print(df.count())

# Draw a random sample without replacement. `fraction` is a per-row inclusion
# probability, not an exact proportion, so the sample size is only
# approximately fraction * count; the seed keeps the draw repeatable.
sampled_df = df.sample(withReplacement=False, fraction=0.5, seed=42)

# Number of rows after sampling.
print(sampled_df.count())

# Writing a Spark dataframe to a parquet file on HDFS

In [None]:
 df.coalesce(1).

# Collecting a Spark dataframe into a "regular" pandas dataframe