#### Custom accumulator

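* `zero()` returns the accumulator's starting value: a vector of zeros with the same length as the value passed in.
* `addInPlace()` performs the accumulation: it adds the second vector into the first, element by element, and returns the result.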

In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext

from pyspark.sql.session import SparkSession
from pyspark.accumulators import AccumulatorParam

In [2]:
class VectorAccumulatorParam(AccumulatorParam):
    """Accumulator parameter that sums list-like vectors element by element."""

    def zero(self, value):
        """Return a list of ``0.0`` entries with the same length as ``value``."""
        return [0.0 for _ in range(len(value))]

    def addInPlace(self, v1, v2):
        """Add ``v2`` into ``v1`` position by position; ``v1`` is mutated and returned."""
        for idx, _ in enumerate(v1):
            v1[idx] += v2[idx]
        return v1

In [3]:
# Reuse the active SparkContext if one already exists, so re-running this cell
# does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
sc

21/12/11 12:45:23 WARN Utils: Your hostname, srimac.local resolves to a loopback address: 127.0.0.1; using 192.168.1.10 instead (on interface en0)
21/12/11 12:45:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/11 12:45:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Create an accumulator seeded with a three-element vector; the custom
# parameter object supplies the zero value and the element-wise addition.
vector_accum = sc.accumulator(
    value=[500.0, 600.0, 700.0],
    accum_param=VectorAccumulatorParam(),
)

vector_accum.value

[500.0, 600.0, 700.0]

In [5]:
# Add a vector to the accumulator (same operation as `vector_accum += [...]`).
vector_accum.add([10.1, 20.2, 30.3])

vector_accum.value

[510.1, 620.2, 730.3]

In [6]:
# Wrap the existing SparkContext in a SparkSession for the DataFrame API.
spark = SparkSession(sparkContext=sc)

#### Setting up the data in PySpark
Two small tables keyed by name: `table_1` holds (name, score) and `table_2` holds (name, id).

In [7]:
# Rows of (name, score).
data_1 = [
    ('Kareem', 75),
    ('Daisy', 89),
    ('Ellen', 92),
    ('Joel', 67),
    ('Sara', 99),
]

table_1 = spark.createDataFrame(data_1, schema=['name', 'score'])

In [8]:
# Print the contents of table_1 as a text table.
table_1.show()

+------+-----+
|  name|score|
+------+-----+
|Kareem|   75|
| Daisy|   89|
| Ellen|   92|
|  Joel|   67|
|  Sara|   99|
+------+-----+



In [9]:
# Rows of (name, id).
data_2 = [
    ('Daisy', 6),
    ('Ellen', 7),
    ('Sara', 9),
    ('Boris', 3),
    ('Victoria', 11),
]

table_2 = spark.createDataFrame(data_2, schema=['name', 'id'])

In [10]:
# Print the contents of table_2 as a text table.
table_2.show()

+--------+---+
|    name| id|
+--------+---+
|   Daisy|  6|
|   Ellen|  7|
|    Sara|  9|
|   Boris|  3|
|Victoria| 11|
+--------+---+



#### Inner join 

In [11]:
# Inner join: keep only the rows whose name appears in both tables.
# Both `name` columns are retained because the join condition is an expression.
inner_join = table_1.join(
    table_2,
    on=table_1['name'] == table_2['name'],
    how='inner',
)

inner_join.show()

+-----+-----+-----+---+
| name|score| name| id|
+-----+-----+-----+---+
|Daisy|   89|Daisy|  6|
|Ellen|   92|Ellen|  7|
| Sara|   99| Sara|  9|
+-----+-----+-----+---+



#### Left outer join

In [12]:
# Left outer join: keep every row of table_1; columns from table_2 are null
# where no matching name exists.
left_join = table_1.join(
    table_2,
    on=table_1['name'] == table_2['name'],
    how='left',
)
left_join.show()

+------+-----+-----+----+
|  name|score| name|  id|
+------+-----+-----+----+
|Kareem|   75| null|null|
|  Joel|   67| null|null|
| Daisy|   89|Daisy|   6|
| Ellen|   92|Ellen|   7|
|  Sara|   99| Sara|   9|
+------+-----+-----+----+



#### Right outer join

In [13]:
# Right outer join: keep every row of table_2; columns from table_1 are null
# where no matching name exists.
# The result gets its own name so it does not overwrite the left-join DataFrame.
right_join = table_1.join(table_2,
                          table_1.name == table_2.name,
                          how = 'right')

right_join.show()

+-----+-----+--------+---+
| name|score|    name| id|
+-----+-----+--------+---+
| null| null|Victoria| 11|
| null| null|   Boris|  3|
|Daisy|   89|   Daisy|  6|
|Ellen|   92|   Ellen|  7|
| Sara|   99|    Sara|  9|
+-----+-----+--------+---+



#### Full outer join

In [14]:
# Full outer join: keep every row from both tables, filling the side without
# a matching name with nulls.
full_outer_join = table_1.join(
    table_2,
    on=table_1['name'] == table_2['name'],
    how='full',
)

full_outer_join.show()

+------+-----+--------+----+
|  name|score|    name|  id|
+------+-----+--------+----+
|  null| null|Victoria|  11|
|Kareem|   75|    null|null|
|  Joel|   67|    null|null|
|  null| null|   Boris|   3|
| Daisy|   89|   Daisy|   6|
| Ellen|   92|   Ellen|   7|
|  Sara|   99|    Sara|   9|
+------+-----+--------+----+

