## Spark Demonstration

Install:

pip install pyspark

pip install findspark


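You will need to set `SPARK_HOME` to your Spark installation directory. The first example below is for a standalone Spark download; the second is for a pyspark package installed in a venv.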

export SPARK_HOME=/home/jbslanka/spark/spark-3.0.0-preview2-bin-hadoop3.2

export SPARK_HOME=/home/jbslanka/mypython/lib/python3.8/site-packages/pyspark

You may also need to execute:

export PYSPARK_PYTHON=python3



In [1]:
# Initialise findspark before pyspark is imported in the next cell.
# NOTE(review): presumably this locates Spark via SPARK_HOME (see the setup
# notes at the top of the notebook) — confirm against the findspark docs.
import findspark
findspark.init()



In [2]:
#Source: https://www.sicara.ai/blog/2017-05-02-get-started-pyspark-jupyter-notebook-3-minutes
#Estimating pi using Monte Carlo: https://www.geeksforgeeks.org/estimating-value-pi-using-monte-carlo/

import pyspark
import random

# Number of random points to draw for the estimate.
num_samples = 100_000_000
# Spark context used by the pi-estimation job below.
sc = pyspark.SparkContext(appName="Pi")

def inside(p):
    """Draw a random point in the unit square and test it against the unit circle.

    Args:
        p: The element handed in by the caller (the RDD filter passes each
            sample index); it is not used by the computation.

    Returns:
        True when the point's squared distance from the origin is below 1,
        otherwise False.
    """
    # Draw x first, then y, so the random stream is consumed in a fixed order.
    x = random.random()
    y = random.random()
    squared_distance = x * x + y * y
    return squared_distance < 1

# Run the Monte Carlo job: keep the samples for which inside() is True and
# count them. The context is stopped in `finally` so that it is released even
# if the job fails; otherwise a later cell could not create a new SparkContext.
try:
    count = sc.parallelize(range(num_samples)).filter(inside).count()
finally:
    sc.stop()

# The fraction of points that landed inside the circle approximates pi / 4.
pi = 4 * count / num_samples
print(pi)

3.141637


In [3]:
from pyspark.sql import SparkSession

# Build the session through the builder: getOrCreate() reuses an existing
# session/context, so re-running this cell does not fail with
# "Cannot run multiple SparkContexts at once".
sess = SparkSession.builder.appName("demo").getOrCreate()
# Keep `sc` bound to the session's SparkContext; the last cell stops it.
sc = sess.sparkContext



In [4]:
# Load the text file into a DataFrame and display its first 10 rows
# without truncating long lines.
strings = sess.read.text("data/README.md")
strings.show(n=10, truncate=False)

+--------------------------------------------------------------------------------+
|value                                                                           |
+--------------------------------------------------------------------------------+
|# Apache Spark                                                                  |
|                                                                                |
|Spark is a unified analytics engine for large-scale data processing. It provides|
|high-level APIs in Scala, Java, Python, and R, and an optimized engine that     |
|supports general computation graphs for data analysis. It also supports a       |
|rich set of higher-level tools including Spark SQL for SQL and DataFrames,      |
|MLlib for machine learning, GraphX for graph processing,                        |
|and Structured Streaming for stream processing.                                 |
|                                                                                |
|<ht

In [5]:
# Count the rows of the DataFrame, i.e. the number of lines read from the file.
# As the last expression in the cell, the result is displayed as its output.
strings.count()


109

In [6]:
# Build a new DataFrame holding only the lines that contain the literal
# string "Spark", preview five of them untruncated, then count the matches.
filtered = strings.where(strings["value"].contains("Spark"))
filtered.show(n=5, truncate=False)
filtered.count()

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                                                                                                               |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|# Apache Spark                                                                                                    

20

In [7]:
# Load data/people.json into a DataFrame and display its rows.
df = sess.read.json("data/people.json")
df.show()




+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [8]:
# Print each column of df with its type and nullability.
df.printSchema()


root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [9]:
# Project a single column.
df.select("name").show()
# Project name alongside a computed column (age + 1).
df.select(df["name"], df["age"] + 1).show()
# Keep only the rows whose age is greater than 21.
df.where(df["age"] > 21).show()
# Count the rows for each distinct age value.
df.groupBy("age").count().show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     null|
|   Andy|       31|
| Justin|       20|
+-------+---------+

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



In [10]:
# Register df as the temporary view "people" so it can be queried with SQL.
df.createOrReplaceTempView("people")

# Run each query in turn and display its result: all rows, rows ordered by
# name, and the row count per age. sqlDF ends up bound to the last result.
for _query in (
    "SELECT * FROM people",
    "SELECT * FROM people order by name",
    "SELECT age,count(*) FROM people group by age",
):
    sqlDF = sess.sql(_query)
    sqlDF.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

+----+-------+
| age|   name|
+----+-------+
|  30|   Andy|
|  19| Justin|
|null|Michael|
+----+-------+

+----+--------+
| age|count(1)|
+----+--------+
|  19|       1|
|null|       1|
|  30|       1|
+----+--------+



In [11]:
# Stop the SparkContext bound to `sc` now that the demo is finished.
sc.stop()