In [1]:
from pyspark.accumulators import AccumulatorParam

#### Creating Custom Accumulator

In [3]:
class VectorAccumulatorParam(AccumulatorParam):
    '''
    Accumulator parameter that combines list-like vectors element by element.
    '''

    def zero(self, value):
        '''
        Build the starting vector: one 0.0 entry per element of ``value``.
        '''
        width = len(value)
        return [0.0] * width

    def addInPlace(self, v1, v2):
        '''
        Fold ``v2`` into ``v1`` position by position and hand back ``v1``.

        ``v1`` is mutated rather than copied; only the first ``len(v1)``
        positions of ``v2`` are read.
        '''
        size = len(v1)
        for slot in range(size):
            v1[slot] += v2[slot]
        return v1

In [4]:
# Register an accumulator seeded with a three-element vector; merging of
# updates is delegated to VectorAccumulatorParam.
vector_accum = sc.accumulator(
    value=[10.0, 20.0, 30.0],
    accum_param=VectorAccumulatorParam(),
)
vector_accum

Accumulator<id=0, value=[10.0, 20.0, 30.0]>

In [5]:
# Add an increment vector to the accumulator (same effect as the += operator).
vector_accum.add([1, 2, 3])
vector_accum

Accumulator<id=0, value=[11.0, 22.0, 33.0]>

####  Joins in Spark

In [8]:
# Left-hand sample data: one (name, salary) tuple per person.
valuesA = [
    ('John', 100000),
    ('James', 150000),
    ('Emily', 65000),
    ('Nina', 200000),
]
tableA = spark.createDataFrame(valuesA, schema=['name', 'salary'])

# Right-hand sample data: one (name, employee_id) tuple per person.
# Only 'James' and 'Emily' also appear in valuesA.
valuesB = [
    ('James', 2),
    ('Emily', 3),
    ('Darth Vader', 5),
    ('Princess Leia', 6),
]
tableB = spark.createDataFrame(valuesB, schema=['name', 'employee_id'])

In [9]:
# Print the rows of tableA (columns: name, salary).
tableA.show()

+-----+------+
| name|salary|
+-----+------+
| John|100000|
|James|150000|
|Emily| 65000|
| Nina|200000|
+-----+------+



In [10]:
# Print the rows of tableB (columns: name, employee_id).
tableB.show()

+-------------+-----------+
|         name|employee_id|
+-------------+-----------+
|        James|          2|
|        Emily|          3|
|  Darth Vader|          5|
|Princess Leia|          6|
+-------------+-----------+



In [11]:
# Inner join: keep only the rows whose name is present in both tables.
# Joining on a column expression keeps both `name` columns in the result.
tableA.join(other=tableB, on=tableA.name == tableB.name, how='inner').show()

+-----+------+-----+-----------+
| name|salary| name|employee_id|
+-----+------+-----+-----------+
|James|150000|James|          2|
|Emily| 65000|Emily|          3|
+-----+------+-----+-----------+



In [12]:
# Left outer join: keep every tableA row; tableB columns are null where
# no matching name exists.
tableA.join(other=tableB, on=tableA.name == tableB.name, how='left').show()

+-----+------+-----+-----------+
| name|salary| name|employee_id|
+-----+------+-----+-----------+
|James|150000|James|          2|
| John|100000| null|       null|
|Emily| 65000|Emily|          3|
| Nina|200000| null|       null|
+-----+------+-----+-----------+



In [13]:
# Right outer join: tableB is the left operand here, so every tableA row is
# kept and the tableB columns are null where no matching name exists.
tableB.join(other=tableA, on=tableA.name == tableB.name, how='right').show()

+-----+-----------+-----+------+
| name|employee_id| name|salary|
+-----+-----------+-----+------+
|James|          2|James|150000|
| null|       null| John|100000|
|Emily|          3|Emily| 65000|
| null|       null| Nina|200000|
+-----+-----------+-----+------+



In [14]:
# Full outer join: keep every row from both tables, filling the columns of
# the side without a matching name with null.
tableB.join(other=tableA, on=tableA.name == tableB.name, how='full').show()

+-------------+-----------+-----+------+
|         name|employee_id| name|salary|
+-------------+-----------+-----+------+
|        James|          2|James|150000|
|         null|       null| John|100000|
|Princess Leia|          6| null|  null|
|        Emily|          3|Emily| 65000|
|         null|       null| Nina|200000|
|  Darth Vader|          5| null|  null|
+-------------+-----------+-----+------+

