https://github.com/mahmoudparsian/pyspark-tutorial

## 1. Start

In [1]:
import findspark
findspark.init()

In [2]:
!echo $PYSPARK_PYTHON
#!pip install pyspark

/Users/shiwang/anaconda3/bin/python3


In [3]:
import pyspark
from pyspark.sql import SQLContext, SparkSession
from pyspark import SparkContext

In [4]:
# Reuse the active SparkContext when one already exists: constructing a second
# SparkContext in the same kernel raises a ValueError, which breaks cell re-runs.
sc = SparkContext.getOrCreate()
sc

In [5]:
# Obtain the session through the builder, the documented entry point. It
# attaches to the SparkContext that is already running and returns the same
# session when the cell is executed again.
spark = SparkSession.builder.getOrCreate()
spark

In [6]:
print(sc.version)

2.2.1


## 2. Basics

### RDD and its transformations

In [7]:
data = sc.parallelize(
    [('Amber', 22), ('Alfred', 23), ('Skye',4), ('Albert', 12), ('Amber', 9)]) #schema-less, allow diff types

In [8]:
data.take(2)

[('Amber', 22), ('Alfred', 23)]

In [9]:
#map
data.map(lambda row: (row[0], row[0])).take(2)

[('Amber', 'Amber'), ('Alfred', 'Alfred')]

In [10]:
#filter
data.filter(lambda row: row[1] < 20).take(3)

[('Skye', 4), ('Albert', 12), ('Amber', 9)]

In [11]:
#flatMap
data.flatMap(lambda row: (row[1], row[1]+1)).take(12)

[22, 23, 23, 24, 4, 5, 12, 13, 9, 10]

In [12]:
#distinct
data.map(lambda row: row[0]).distinct().collect()

['Amber', 'Skye', 'Alfred', 'Albert']

In [13]:
# Join
data_2 = sc.parallelize([('Amber', 2222), ('Alfred', 23)]) 
rdd = data.leftOuterJoin(data_2)
rdd.collect()

[('Amber', (22, 2222)),
 ('Amber', (9, 2222)),
 ('Skye', (4, None)),
 ('Alfred', (23, 23)),
 ('Albert', (12, None))]

In [14]:
rdd = data.join(data_2)
rdd.collect()

[('Amber', (22, 2222)), ('Amber', (9, 2222)), ('Alfred', (23, 23))]

In [15]:
#intersection
rdd = data.intersection(data_2)
rdd.collect()

[('Alfred', 23)]

### RDD and its actions

In [16]:
#count
data.count()

5

In [17]:
#sample
# A fixed seed makes the sampled rows repeatable from run to run.
data.takeSample(withReplacement=False, num=3, seed=42)

[('Skye', 4), ('Amber', 9), ('Amber', 22)]

In [18]:
#reduce
data.map(lambda row: row[1]).reduce(lambda x, y: x + y)

70

In [19]:
# NOTE: division is neither associative nor commutative, while RDD.reduce
# expects an operator that is both, so this result may change with how the
# data is partitioned -- treat the value as illustrative only.
data.map(lambda row: row[1]).reduce(lambda x, y: x / y)

0.0022141706924315623

In [20]:
#reduce by key
data.reduceByKey(lambda x, y: x + y).collect() # values sharing a key are summed; each result is a (key, total) tuple

[('Amber', 31), ('Skye', 4), ('Alfred', 23), ('Albert', 12)]

In [21]:
# count by key
data.countByKey().items()

dict_items([('Amber', 2), ('Alfred', 1), ('Skye', 1), ('Albert', 1)])

In [22]:
# Save and Read
# saveAsTextFile writes a directory (one part-file per partition) and fails
# if the target already exists, so remove any output left by a previous run.
# NOTE(review): assumes the relative path resolves on the local filesystem.
import shutil

shutil.rmtree('./temp/data_key.txt', ignore_errors=True)
data.saveAsTextFile('./temp/data_key.txt')

In [23]:
def parseInput(row):
    """Turn one text line such as "('Amber', 22)" back into a (name, int) tuple.

    The line is evaluated as a Python literal instead of being picked apart
    with a regular expression, so names containing digits, spaces or quotes
    and negative numbers are accepted as well.

    Args:
        row: one line of text holding the printed form of a 2-tuple.

    Returns:
        A tuple (name, value) where value has been converted to int.

    Raises:
        ValueError or SyntaxError: if the line is not a valid Python literal
            or does not contain exactly two items.
    """
    # Function-local import, mirroring the original's local `import re`.
    import ast

    # literal_eval only accepts literals, so no arbitrary code is executed.
    name, value = ast.literal_eval(row)
    return (name, int(value))
    
# Read the saved text back and rebuild one tuple per line with parseInput.
data_key_reread = (
    sc.textFile('./temp/data_key.txt')
      .map(parseInput)
)

data_key_reread.collect()

[('Amber', 22), ('Alfred', 23), ('Skye', 4), ('Albert', 12), ('Amber', 9)]

In [24]:
# for each
def function(x):
    """Print a single element; passed to ``data.foreach`` in this cell."""
    print(x) # per the original note, output appears in the terminal rather than in the notebook
    
data.foreach(function)

### DataFrames

In [25]:
stringJSONRDD = sc.parallelize((""" 
  { "id": "123",
    "name": "Katie",
    "age": 19,
    "eyeColor": "brown"
  }""",
   """{
    "id": "234",
    "name": "Michael",
    "age": 22,
    "eyeColor": "green"
  }""", 
  """{
    "id": "345",
    "name": "Michael",
    "age": 23,
    "eyeColor": "green"
  }""")
)

In [26]:
# Create DataFrame from JSON
swimmersJSON = spark.read.json(stringJSONRDD)

In [27]:
#create table
swimmersJSON.createOrReplaceTempView("swimmersJSON")

In [28]:
#schema - auto-inferred
swimmersJSON.printSchema()

root
 |-- age: long (nullable = true)
 |-- eyeColor: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



In [29]:
#show
swimmersJSON.show()

+---+--------+---+-------+
|age|eyeColor| id|   name|
+---+--------+---+-------+
| 19|   brown|123|  Katie|
| 22|   green|234|Michael|
| 23|   green|345|Michael|
+---+--------+---+-------+



In [30]:
# SQL query
sql_query = "select * from swimmersJSON"
spark.sql(sql_query).collect()

[Row(age=19, eyeColor='brown', id='123', name='Katie'),
 Row(age=22, eyeColor='green', id='234', name='Michael'),
 Row(age=23, eyeColor='green', id='345', name='Michael')]

In [31]:
# DataFrame Query
swimmersJSON.select(swimmersJSON.age, swimmersJSON.eyeColor) \
            .filter(swimmersJSON.age > 20) \
            .show()

+---+--------+
|age|eyeColor|
+---+--------+
| 22|   green|
| 23|   green|
+---+--------+



In [32]:
# DataFrame Query
swimmersJSON.select(['age', 'eyeColor']) \
            .where(swimmersJSON.age > 20) \
            .show()

+---+--------+
|age|eyeColor|
+---+--------+
| 22|   green|
| 23|   green|
+---+--------+



In [33]:
import pyspark.sql.functions as fn

swimmersJSON.agg(
    fn.count('id').alias('count'),
    fn.countDistinct('id').alias('distinct')
).show()

+-----+--------+
|count|distinct|
+-----+--------+
|    3|       3|
+-----+--------+



In [34]:
# drop duplicates
df = swimmersJSON.dropDuplicates(
    subset=[c for c in swimmersJSON.columns \
            if c not in ['id','age']]) # compare rows on every column except id and age, then drop the duplicates
df.show()

+---+--------+---+-------+
|age|eyeColor| id|   name|
+---+--------+---+-------+
| 22|   green|234|Michael|
| 19|   brown|123|  Katie|
+---+--------+---+-------+



In [35]:
swimmersJSON.groupBy('eyeColor').count().show()

+--------+-----+
|eyeColor|count|
+--------+-----+
|   green|    2|
|   brown|    1|
+--------+-----+



In [36]:
swimmersJSON.describe('Age').show()

+-------+------------------+
|summary|               Age|
+-------+------------------+
|  count|                 3|
|   mean|21.333333333333332|
| stddev| 2.081665999466133|
|    min|                19|
|    max|                23|
+-------+------------------+



In [37]:
swimmersJSON.agg({'Age':'mean'}).show()

+------------------+
|          avg(Age)|
+------------------+
|21.333333333333332|
+------------------+



In [38]:
# Histogram
# flatMap unwraps each single-column Row into its bare value; histogram(2) then splits the values into 2 buckets
hists = swimmersJSON.select('Age').rdd.flatMap(lambda row: row).histogram(2)

In [39]:
hists # size of bins calculated by workers before returning to driver

([19, 21, 23], [1, 2])

## 3. MLlib

## 4. TensorFrames

In [40]:
import tensorflow as tf
import tensorframes as tfs
from pyspark.sql import Row

ModuleNotFoundError: No module named 'tensorframes'