In [1]:
### Estimating the value of Pi

In [2]:
import random
from pyspark import SparkContext

# note: the spark context exists as sc

def inside(p):
    """Sample one random point and report whether it falls inside the unit circle.

    Two values are drawn from ``random.random()`` and used as the x and y
    coordinates of a point; the point counts as a hit when its squared
    distance from the origin is strictly less than 1.

    Args:
        p: The element supplied by the caller (e.g. by ``filter``). It is
            not read; each call draws fresh random coordinates instead.

    Returns:
        bool: True when the sampled point satisfies x*x + y*y < 1.
    """
    x = random.random()
    y = random.random()
    squared_distance = x*x + y*y
    return squared_distance < 1

# Number of random points to sample for the Monte Carlo estimate.
NUM_SAMPLES = 1000000

# Distribute one element per sample, keep those for which inside() is True,
# and count the survivors.
samples = sc.parallelize(range(NUM_SAMPLES))
count = samples.filter(inside).count()

# The hit ratio is scaled by 4 to turn it into the estimate of pi.
pi_estimate = 4.0 * count / NUM_SAMPLES
print("Pi is roughly %f" % pi_estimate)

Pi is roughly 3.140424


In [None]:
### Reading a CSV file

In [11]:
# Configure the reader with the 'header' and 'inferSchema' options enabled,
# then load data/data.csv into a DataFrame.
csv_reader = (
    spark.read
    .option('inferSchema', 'true')
    .option('header', 'true')
)
dummy_csv = csv_reader.csv('data/data.csv')

In [12]:
# Preview the first 3 rows of the loaded DataFrame
dummy_csv.show(3)

+------+---+------+-------+-----+
|  name|age|height|   city|state|
+------+---+------+-------+-----+
|angela| 32|   160|Concord|   CA|
|   joe| 43|   156| Haward|   CA|
| elvis| 27|   162|Ontario|   CA|
+------+---+------+-------+-----+
only showing top 3 rows



In [14]:
# Order the rows by the "height" column and display the first four
height_sorted = dummy_csv.sort("height")
height_sorted.show(4)

+------+---+------+-------+-----+
|  name|age|height|   city|state|
+------+---+------+-------+-----+
|   joe| 43|   156| Haward|   CA|
|angela| 32|   160|Concord|   CA|
| elvis| 27|   162|Ontario|   CA|
| nancy| 34|   170| Pomona|   CA|
+------+---+------+-------+-----+



In [15]:
### Running a SQL statement

In [16]:
# Register the DataFrame as a temporary view named "basic_table" so that it
# can be referenced by name in SQL statements
dummy_csv.createOrReplaceTempView("basic_table")

In [17]:
# Select name, age and height for every row whose age is at least 32
age_filter_query = "SELECT name, age, height from basic_table WHERE age >= 32"
sql_code = spark.sql(age_filter_query)

In [18]:
# Print the rows held in sql_code
sql_code.show()

+------+---+------+
|  name|age|height|
+------+---+------+
|angela| 32|   160|
|   joe| 43|   156|
| nancy| 34|   170|
+------+---+------+



In [20]:
### Using groupBy to count the number of people in each state
# (the sample data only contains rows for CA, so a single group is shown)
# NOTE: this rebinds sql_code to the grouped counts.
sql_code = (
    dummy_csv
    .groupBy('state')
    .count()
)
sql_code.show()

+-----+-----+
|state|count|
+-----+-----+
|   CA|    4|
+-----+-----+



In [35]:
# Print each column's name, data type and nullability
dummy_csv.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- height: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)



In [37]:
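# PySpark with GraphFrames
# Shell command (run from a terminal, not from this notebook) that starts Spark with the GraphFrames package:
# /opt/spark/bin/spark-shell --packages graphframes:graphframes:0.8.2-spark3.2-s_2.12
# NOTE: the error recorded below mentions 'sparkdl', which this cell does not reference; the output appears to be stale.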

ModuleNotFoundError: No module named 'sparkdl'